https://github.com/jdoerfert updated https://github.com/llvm/llvm-project/pull/94549
>From 36618e65d94ffa3e83464b7d19ff6cd7d5855abf Mon Sep 17 00:00:00 2001 From: Johannes Doerfert <johan...@jdoerfert.de> Date: Wed, 5 Jun 2024 16:51:51 -0700 Subject: [PATCH 1/3] [Offload][NFCI] Initialize the KernelArgsTy to default values --- offload/include/Shared/APITypes.h | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h index e8fc27785b6c2..fd315c6b992b9 100644 --- a/offload/include/Shared/APITypes.h +++ b/offload/include/Shared/APITypes.h @@ -89,22 +89,26 @@ struct __tgt_async_info { /// This struct contains all of the arguments to a target kernel region launch. struct KernelArgsTy { - uint32_t Version; // Version of this struct for ABI compatibility. - uint32_t NumArgs; // Number of arguments in each input pointer. - void **ArgBasePtrs; // Base pointer of each argument (e.g. a struct). - void **ArgPtrs; // Pointer to the argument data. - int64_t *ArgSizes; // Size of the argument data in bytes. - int64_t *ArgTypes; // Type of the data (e.g. to / from). - void **ArgNames; // Name of the data for debugging, possibly null. - void **ArgMappers; // User-defined mappers, possibly null. - uint64_t Tripcount; // Tripcount for the teams / distribute loop, 0 otherwise. + uint32_t Version = 0; // Version of this struct for ABI compatibility. + uint32_t NumArgs = 0; // Number of arguments in each input pointer. + void **ArgBasePtrs = + nullptr; // Base pointer of each argument (e.g. a struct). + void **ArgPtrs = nullptr; // Pointer to the argument data. + int64_t *ArgSizes = nullptr; // Size of the argument data in bytes. + int64_t *ArgTypes = nullptr; // Type of the data (e.g. to / from). + void **ArgNames = nullptr; // Name of the data for debugging, possibly null. + void **ArgMappers = nullptr; // User-defined mappers, possibly null. + uint64_t Tripcount = + 0; // Tripcount for the teams / distribute loop, 0 otherwise. struct { uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause. uint64_t Unused : 63; - } Flags; - uint32_t NumTeams[3]; // The number of teams (for x,y,z dimension). - uint32_t ThreadLimit[3]; // The number of threads (for x,y,z dimension). - uint32_t DynCGroupMem; // Amount of dynamic cgroup memory requested. + } Flags = {0, 0}; + uint32_t NumTeams[3] = {0, 0, + 0}; // The number of teams (for x,y,z dimension). + uint32_t ThreadLimit[3] = {0, 0, + 0}; // The number of threads (for x,y,z dimension). + uint32_t DynCGroupMem = 0; // Amount of dynamic cgroup memory requested. }; static_assert(sizeof(KernelArgsTy().Flags) == sizeof(uint64_t), "Invalid struct size"); >From e593d163cd3a78bd1e64b1dc276ebc7e8baaeb0b Mon Sep 17 00:00:00 2001 From: Johannes Doerfert <johan...@jdoerfert.de> Date: Tue, 11 Jun 2024 01:49:34 -0700 Subject: [PATCH 2/3] [Offload] Use flat array for cuLaunchKernel We already used a flat array of kernel launch parameters for the AMD GPU launch, but now we also use this scheme for the NVIDIA GPU launch. The only remaining/required use of the indirection is the host plugin (due to ffi). This allows us to simplify the use for non-OpenMP kernel launches.
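For reference, the driver-API mechanism this switches to looks roughly like the sketch below. The helper name `launchFlat` and its parameters are made up for illustration; only the `CU_LAUNCH_PARAM_*` config array and the `cuLaunchKernel` call mirror the actual change.

```
// Rough sketch (illustrative only): launch with a packed, flat argument
// buffer via cuLaunchKernel's `extra` parameter instead of the usual
// per-argument pointer array passed as `kernelParams`.
#include <cuda.h>
#include <cstddef>

CUresult launchFlat(CUfunction Func, CUstream Stream, void *Data, size_t Size,
                    unsigned NumBlocks, unsigned NumThreads,
                    unsigned DynSharedMem) {
  void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, Data,
                    CU_LAUNCH_PARAM_BUFFER_SIZE, &Size,
                    CU_LAUNCH_PARAM_END};
  // kernelParams and extra are mutually exclusive; with a flat buffer the
  // driver unpacks the arguments according to the kernel's parameter layout.
  return cuLaunchKernel(Func, NumBlocks, /*gridDimY=*/1, /*gridDimZ=*/1,
                        NumThreads, /*blockDimY=*/1, /*blockDimZ=*/1,
                        DynSharedMem, Stream, /*kernelParams=*/nullptr, Config);
}
```

The AMD GPU plugin already packs arguments into a flat buffer for its dispatch, so both GPU plugins now consume the same flat launch-parameter representation.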
--- offload/include/Shared/APITypes.h | 10 +++++++ offload/plugins-nextgen/amdgpu/src/rtl.cpp | 26 +++++++++-------- .../common/include/PluginInterface.h | 14 +++++---- .../common/src/PluginInterface.cpp | 28 +++++++++--------- offload/plugins-nextgen/cuda/src/rtl.cpp | 29 ++++++++++++------- offload/plugins-nextgen/host/src/rtl.cpp | 5 ++-- 6 files changed, 68 insertions(+), 44 deletions(-) diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h index fd315c6b992b9..1dd69baa7b578 100644 --- a/offload/include/Shared/APITypes.h +++ b/offload/include/Shared/APITypes.h @@ -116,6 +116,16 @@ static_assert(sizeof(KernelArgsTy) == (8 * sizeof(int32_t) + 3 * sizeof(int64_t) + 4 * sizeof(void **) + 2 * sizeof(int64_t *)), "Invalid struct size"); + +/// Flat array of kernel launch parameters and their total size. +struct KernelLaunchParamsTy { + /// Size of the Data array. + size_t Size = 0; + /// Flat array of kernel parameters. + void *Data = nullptr; + /// Ptrs to the Data entries. Only strictly required for the host plugin. + void **Ptrs = nullptr; +}; } #endif // OMPTARGET_SHARED_API_TYPES_H diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index c6dd954746e4a..43e0bbd85a9d3 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -20,6 +20,7 @@ #include <unistd.h> #include <unordered_map> +#include "Shared/APITypes.h" #include "Shared/Debug.h" #include "Shared/Environment.h" #include "Shared/Utils.h" @@ -558,7 +559,8 @@ struct AMDGPUKernelTy : public GenericKernelTy { /// Launch the AMDGPU kernel function. Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads, - uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args, + uint64_t NumBlocks, KernelArgsTy &KernelArgs, + KernelLaunchParamsTy LaunchParams, AsyncInfoWrapperTy &AsyncInfoWrapper) const override; /// Print more elaborate kernel launch info for AMDGPU @@ -2802,9 +2804,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr); KernelArgsTy KernelArgs = {}; - if (auto Err = AMDGPUKernel.launchImpl(*this, /*NumThread=*/1u, - /*NumBlocks=*/1ul, KernelArgs, - /*Args=*/nullptr, AsyncInfoWrapper)) + if (auto Err = + AMDGPUKernel.launchImpl(*this, /*NumThread=*/1u, + /*NumBlocks=*/1ul, KernelArgs, + KernelLaunchParamsTy{}, AsyncInfoWrapper)) return Err; Error Err = Plugin::success(); @@ -3266,18 +3269,18 @@ struct AMDGPUPluginTy final : public GenericPluginTy { Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads, uint64_t NumBlocks, - KernelArgsTy &KernelArgs, void *Args, + KernelArgsTy &KernelArgs, + KernelLaunchParamsTy LaunchParams, AsyncInfoWrapperTy &AsyncInfoWrapper) const { - const uint32_t KernelArgsSize = KernelArgs.NumArgs * sizeof(void *); - if (ArgsSize < KernelArgsSize) + if (ArgsSize < LaunchParams.Size) return Plugin::error("Mismatch of kernel arguments size"); // The args size reported by HSA may or may not contain the implicit args. // For now, assume that HSA does not consider the implicit arguments when // reporting the arguments of a kernel. In the worst case, we can waste // 56 bytes per allocation. 
- uint32_t AllArgsSize = KernelArgsSize + ImplicitArgsSize; + uint32_t AllArgsSize = LaunchParams.Size + ImplicitArgsSize; AMDGPUPluginTy &AMDGPUPlugin = static_cast<AMDGPUPluginTy &>(GenericDevice.Plugin); @@ -3302,7 +3305,7 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, // Initialize implicit arguments. utils::AMDGPUImplicitArgsTy *ImplArgs = reinterpret_cast<utils::AMDGPUImplicitArgsTy *>( - advanceVoidPtr(AllArgs, KernelArgsSize)); + advanceVoidPtr(AllArgs, LaunchParams.Size)); // Initialize the implicit arguments to zero. std::memset(ImplArgs, 0, ImplicitArgsSize); @@ -3310,9 +3313,8 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, // Copy the explicit arguments. // TODO: We should expose the args memory manager alloc to the common part as // alternative to copying them twice. - if (KernelArgs.NumArgs) - std::memcpy(AllArgs, *static_cast<void **>(Args), - sizeof(void *) * KernelArgs.NumArgs); + if (LaunchParams.Size) + std::memcpy(AllArgs, LaunchParams.Data, LaunchParams.Size); AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(GenericDevice); diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index eda6a4fd541e9..37d16ae3a7027 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -19,6 +19,7 @@ #include <shared_mutex> #include <vector> +#include "Shared/APITypes.h" #include "Shared/Debug.h" #include "Shared/Environment.h" #include "Shared/EnvironmentVar.h" @@ -265,7 +266,7 @@ struct GenericKernelTy { AsyncInfoWrapperTy &AsyncInfoWrapper) const; virtual Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads, uint64_t NumBlocks, KernelArgsTy &KernelArgs, - void *Args, + KernelLaunchParamsTy LaunchParams, AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0; /// Get the kernel name. @@ -326,11 +327,12 @@ struct GenericKernelTy { private: /// Prepare the arguments before launching the kernel. - void *prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs, - ptrdiff_t *ArgOffsets, uint32_t &NumArgs, - llvm::SmallVectorImpl<void *> &Args, - llvm::SmallVectorImpl<void *> &Ptrs, - KernelLaunchEnvironmentTy *KernelLaunchEnvironment) const; + KernelLaunchParamsTy + prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs, + ptrdiff_t *ArgOffsets, uint32_t &NumArgs, + llvm::SmallVectorImpl<void *> &Args, + llvm::SmallVectorImpl<void *> &Ptrs, + KernelLaunchEnvironmentTy *KernelLaunchEnvironment) const; /// Get the number of threads and blocks for the kernel based on the /// user-defined threads and block clauses. 
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index 913721a15d713..00e12aecf7512 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -268,9 +268,9 @@ struct RecordReplayTy { OS.close(); } - void saveKernelDescr(const char *Name, void **ArgPtrs, int32_t NumArgs, - uint64_t NumTeamsClause, uint32_t ThreadLimitClause, - uint64_t LoopTripCount) { + void saveKernelDescr(const char *Name, KernelLaunchParamsTy LaunchParams, + int32_t NumArgs, uint64_t NumTeamsClause, + uint32_t ThreadLimitClause, uint64_t LoopTripCount) { json::Object JsonKernelInfo; JsonKernelInfo["Name"] = Name; JsonKernelInfo["NumArgs"] = NumArgs; @@ -283,7 +283,7 @@ struct RecordReplayTy { json::Array JsonArgPtrs; for (int I = 0; I < NumArgs; ++I) - JsonArgPtrs.push_back((intptr_t)ArgPtrs[I]); + JsonArgPtrs.push_back((intptr_t)LaunchParams.Ptrs[I]); JsonKernelInfo["ArgPtrs"] = json::Value(std::move(JsonArgPtrs)); json::Array JsonArgOffsets; @@ -549,7 +549,7 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs, if (!KernelLaunchEnvOrErr) return KernelLaunchEnvOrErr.takeError(); - void *KernelArgsPtr = + KernelLaunchParamsTy LaunchParams = prepareArgs(GenericDevice, ArgPtrs, ArgOffsets, KernelArgs.NumArgs, Args, Ptrs, *KernelLaunchEnvOrErr); @@ -564,7 +564,7 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs, if (RecordReplay.isRecording()) { RecordReplay.saveImage(getName(), getImage()); RecordReplay.saveKernelInput(getName(), getImage()); - RecordReplay.saveKernelDescr(getName(), Ptrs.data(), KernelArgs.NumArgs, + RecordReplay.saveKernelDescr(getName(), LaunchParams, KernelArgs.NumArgs, NumBlocks, NumThreads, KernelArgs.Tripcount); } @@ -573,10 +573,10 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs, return Err; return launchImpl(GenericDevice, NumThreads, NumBlocks, KernelArgs, - KernelArgsPtr, AsyncInfoWrapper); + LaunchParams, AsyncInfoWrapper); } -void *GenericKernelTy::prepareArgs( +KernelLaunchParamsTy GenericKernelTy::prepareArgs( GenericDeviceTy &GenericDevice, void **ArgPtrs, ptrdiff_t *ArgOffsets, uint32_t &NumArgs, llvm::SmallVectorImpl<void *> &Args, llvm::SmallVectorImpl<void *> &Ptrs, @@ -585,22 +585,22 @@ void *GenericKernelTy::prepareArgs( NumArgs += KLEOffset; if (NumArgs == 0) - return nullptr; + return KernelLaunchParamsTy{}; Args.resize(NumArgs); Ptrs.resize(NumArgs); if (KernelLaunchEnvironment) { - Ptrs[0] = KernelLaunchEnvironment; - Args[0] = &Ptrs[0]; + Args[0] = KernelLaunchEnvironment; + Ptrs[0] = &Args[0]; } for (uint32_t I = KLEOffset; I < NumArgs; ++I) { - Ptrs[I] = + Args[I] = (void *)((intptr_t)ArgPtrs[I - KLEOffset] + ArgOffsets[I - KLEOffset]); - Args[I] = &Ptrs[I]; + Ptrs[I] = &Args[I]; } - return &Args[0]; + return KernelLaunchParamsTy{sizeof(void *) * NumArgs, &Args[0], &Ptrs[0]}; } uint32_t GenericKernelTy::getNumThreads(GenericDeviceTy &GenericDevice, diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index b260334baa18b..a8d8846791745 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -16,6 +16,7 @@ #include <string> #include <unordered_map> +#include "Shared/APITypes.h" #include "Shared/Debug.h" #include "Shared/Environment.h" @@ -149,7 +150,8 @@ struct CUDAKernelTy : public GenericKernelTy { /// Launch the CUDA kernel function. 
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads, - uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args, + uint64_t NumBlocks, KernelArgsTy &KernelArgs, + KernelLaunchParamsTy LaunchParams, AsyncInfoWrapperTy &AsyncInfoWrapper) const override; private: @@ -1228,9 +1230,10 @@ struct CUDADeviceTy : public GenericDeviceTy { AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr); KernelArgsTy KernelArgs = {}; - if (auto Err = CUDAKernel.launchImpl(*this, /*NumThread=*/1u, - /*NumBlocks=*/1ul, KernelArgs, nullptr, - AsyncInfoWrapper)) + if (auto Err = + CUDAKernel.launchImpl(*this, /*NumThread=*/1u, + /*NumBlocks=*/1ul, KernelArgs, + KernelLaunchParamsTy{}, AsyncInfoWrapper)) return Err; Error Err = Plugin::success(); @@ -1274,7 +1277,8 @@ struct CUDADeviceTy : public GenericDeviceTy { Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads, uint64_t NumBlocks, - KernelArgsTy &KernelArgs, void *Args, + KernelArgsTy &KernelArgs, + KernelLaunchParamsTy LaunchParams, AsyncInfoWrapperTy &AsyncInfoWrapper) const { CUDADeviceTy &CUDADevice = static_cast<CUDADeviceTy &>(GenericDevice); @@ -1285,11 +1289,16 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice, uint32_t MaxDynCGroupMem = std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize()); - CUresult Res = - cuLaunchKernel(Func, NumBlocks, /*gridDimY=*/1, - /*gridDimZ=*/1, NumThreads, - /*blockDimY=*/1, /*blockDimZ=*/1, MaxDynCGroupMem, Stream, - (void **)Args, nullptr); + void *Config[] = {/* CU_LAUNCH_PARAM_BUFFER_POINTER */ (void *)0x01, + LaunchParams.Data, + /* CU_LAUNCH_PARAM_BUFFER_SIZE */ (void *)0x02, + reinterpret_cast<void *>(&LaunchParams.Size), + /* CU_LAUNCH_PARAM_END */ (void *)0x00}; + + CUresult Res = cuLaunchKernel(Func, NumBlocks, /*gridDimY=*/1, + /*gridDimZ=*/1, NumThreads, + /*blockDimY=*/1, /*blockDimZ=*/1, + MaxDynCGroupMem, Stream, nullptr, Config); return Plugin::check(Res, "Error in cuLaunchKernel for '%s': %s", getName()); } diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp index ef84cbaf54588..ef2488e42c15b 100644 --- a/offload/plugins-nextgen/host/src/rtl.cpp +++ b/offload/plugins-nextgen/host/src/rtl.cpp @@ -90,7 +90,8 @@ struct GenELF64KernelTy : public GenericKernelTy { /// Launch the kernel using the libffi. Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads, - uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args, + uint64_t NumBlocks, KernelArgsTy &KernelArgs, + KernelLaunchParamsTy LaunchParams, AsyncInfoWrapperTy &AsyncInfoWrapper) const override { // Create a vector of ffi_types, one per argument. SmallVector<ffi_type *, 16> ArgTypes(KernelArgs.NumArgs, &ffi_type_pointer); @@ -105,7 +106,7 @@ struct GenELF64KernelTy : public GenericKernelTy { // Call the kernel function through libffi. long Return; - ffi_call(&Cif, Func, &Return, (void **)Args); + ffi_call(&Cif, Func, &Return, (void **)LaunchParams.Ptrs); return Plugin::success(); } >From 7b334f1de32675c8e382cbe82d571306e0e5dc81 Mon Sep 17 00:00:00 2001 From: Johannes Doerfert <johan...@jdoerfert.de> Date: Mon, 3 Jun 2024 19:52:12 -0700 Subject: [PATCH 3/3] [Offload][CUDA] Allow CUDA kernels to use LLVM/Offload MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Through the new `-foffload-via-llvm` flag, CUDA kernels can now be lowered to the LLVM/Offload API. 
On the Clang side, this is simply done by using the OpenMP offload toolchain and emitting calls to `llvm*` functions to orchestrate the kernel launch rather than `cuda*` functions. These `llvm*` functions are implemented on top of the existing LLVM/Offload API. As we are about to redefine the Offload API, this will help us in the design process by serving as a second offload language. We do not support any CUDA APIs yet; however, we could: https://www.osti.gov/servlets/purl/1892137 For proper host execution we need to resurrect/rebase https://tianshilei.me/wp-content/uploads/2021/12/llpp-2021.pdf (which was designed for debugging). ``` ❯❯❯ cat test.cu extern "C" { void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum); void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum); } __global__ void square(int *A) { *A = 42; } int main(int argc, char **argv) { int DevNo = 0; int *Ptr = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(4, DevNo)); *Ptr = 7; printf("Ptr %p, *Ptr %i\n", Ptr, *Ptr); square<<<1, 1>>>(Ptr); printf("Ptr %p, *Ptr %i\n", Ptr, *Ptr); llvm_omp_target_free_shared(Ptr, DevNo); } ❯❯❯ clang++ test.cu -O3 -o test123 -foffload-via-llvm --offload-arch=native ❯❯❯ llvm-objdump --offloading test123 test123: file format elf64-x86-64 OFFLOADING IMAGE [0]: kind elf arch gfx90a triple amdgcn-amd-amdhsa producer openmp ❯❯❯ LIBOMPTARGET_INFO=16 ./test123 Ptr 0x155448ac8000, *Ptr 7 Ptr 0x155448ac8000, *Ptr 42 ``` --- clang/include/clang/Basic/LangOptions.def | 1 + clang/include/clang/Driver/Options.td | 6 ++ clang/lib/CodeGen/CGCUDANV.cpp | 97 ++++++++++++++++--- clang/lib/Driver/Driver.cpp | 19 ++-- clang/lib/Driver/ToolChains/Clang.cpp | 27 +++++- clang/lib/Driver/ToolChains/CommonArgs.cpp | 7 +- clang/lib/Driver/ToolChains/Cuda.cpp | 27 +++--- clang/lib/Headers/CMakeLists.txt | 18 +++- .../llvm_offload_wrappers/__llvm_offload.h | 31 ++++++ .../__llvm_offload_device.h | 10 ++ .../__llvm_offload_host.h | 15 +++ .../__clang_openmp_device_functions.h | 9 +- clang/lib/Sema/SemaCUDA.cpp | 3 + clang/test/CodeGenCUDA/offload_via_llvm.cu | 97 +++++++++++++++++++ clang/test/Driver/cuda-via-liboffload.cu | 23 +++++ offload/include/Shared/APITypes.h | 5 +- offload/include/omptarget.h | 2 +- .../common/src/PluginInterface.cpp | 13 ++- offload/src/CMakeLists.txt | 1 + offload/src/KernelLanguage/API.cpp | 76 +++++++++++++++ offload/src/exports | 3 + offload/test/lit.cfg | 2 +- offload/test/offloading/CUDA/basic_launch.cu | 31 ++++++ .../CUDA/basic_launch_blocks_and_threads.cu | 32 ++++++ .../offloading/CUDA/basic_launch_multi_arg.cu | 41 ++++++ offload/test/offloading/CUDA/kernel_tu.cu.inc | 1 + offload/test/offloading/CUDA/launch_tu.cu | 32 ++++++ 27 files changed, 576 insertions(+), 53 deletions(-) create mode 100644 clang/lib/Headers/llvm_offload_wrappers/__llvm_offload.h create mode 100644 clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_device.h create mode 100644 clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_host.h create mode 100644 clang/test/CodeGenCUDA/offload_via_llvm.cu create mode 100644 clang/test/Driver/cuda-via-liboffload.cu create mode 100644 offload/src/KernelLanguage/API.cpp create mode 100644 offload/test/offloading/CUDA/basic_launch.cu create mode 100644 offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu create mode 100644 offload/test/offloading/CUDA/basic_launch_multi_arg.cu create mode 100644 offload/test/offloading/CUDA/kernel_tu.cu.inc create mode 100644 offload/test/offloading/CUDA/launch_tu.cu diff --git
a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 4061451b2150a..8aff98867202e 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -288,6 +288,7 @@ LANGOPT(GPUMaxThreadsPerBlock, 32, 1024, "default max threads per block for kern LANGOPT(GPUDeferDiag, 1, 0, "defer host/device related diagnostic messages for CUDA/HIP") LANGOPT(GPUExcludeWrongSideOverloads, 1, 0, "always exclude wrong side overloads in overloading resolution for CUDA/HIP") LANGOPT(OffloadingNewDriver, 1, 0, "use the new driver for generating offloading code.") +LANGOPT(OffloadViaLLVM, 1, 0, "target LLVM/Offload as portable offloading runtime.") LANGOPT(SYCLIsDevice , 1, 0, "Generate code for SYCL device") LANGOPT(SYCLIsHost , 1, 0, "SYCL host compilation") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 57f37c5023110..a09d75917ff98 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1275,6 +1275,12 @@ def no_offload_compress : Flag<["--"], "no-offload-compress">; def offload_compression_level_EQ : Joined<["--"], "offload-compression-level=">, Flags<[HelpHidden]>, HelpText<"Compression level for offload device binaries (HIP only)">; + +defm offload_via_llvm : BoolFOption<"offload-via-llvm", + LangOpts<"OffloadViaLLVM">, DefaultFalse, + PosFlag<SetTrue, [], [ClangOption, CC1Option], "Use">, + NegFlag<SetFalse, [], [ClangOption], "Don't use">, + BothFlags<[], [ClangOption], " LLVM/Offload as portable offloading runtime.">>; } // CUDA options diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp index 43dfbbb90dd52..2ebe0bf802dfa 100644 --- a/clang/lib/CodeGen/CGCUDANV.cpp +++ b/clang/lib/CodeGen/CGCUDANV.cpp @@ -15,10 +15,12 @@ #include "CGCXXABI.h" #include "CodeGenFunction.h" #include "CodeGenModule.h" +#include "clang/AST/CharUnits.h" #include "clang/AST/Decl.h" #include "clang/Basic/Cuda.h" #include "clang/CodeGen/CodeGenABITypes.h" #include "clang/CodeGen/ConstantInitBuilder.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Frontend/Offloading/Utility.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" @@ -36,6 +38,11 @@ constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF" class CGNVCUDARuntime : public CGCUDARuntime { + /// The prefix used for function calls and section names (CUDA, HIP, LLVM) + StringRef Prefix; + /// TODO: We should transition the OpenMP section to LLVM/Offload + StringRef SectionPrefix; + private: llvm::IntegerType *IntTy, *SizeTy; llvm::Type *VoidTy; @@ -132,6 +139,9 @@ class CGNVCUDARuntime : public CGCUDARuntime { return DummyFunc; } + Address prepareKernelArgs(CodeGenFunction &CGF, FunctionArgList &Args); + Address prepareKernelArgsLLVMOffload(CodeGenFunction &CGF, + FunctionArgList &Args); void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args); void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args); std::string getDeviceSideName(const NamedDecl *ND) override; @@ -191,15 +201,11 @@ class CGNVCUDARuntime : public CGCUDARuntime { } // end anonymous namespace std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const { - if (CGM.getLangOpts().HIP) - return ((Twine("hip") + Twine(FuncName)).str()); - return ((Twine("cuda") + Twine(FuncName)).str()); + return (Prefix + FuncName).str(); } std::string CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const { - if (CGM.getLangOpts().HIP) - return ((Twine("__hip") + 
Twine(FuncName)).str()); - return ((Twine("__cuda") + Twine(FuncName)).str()); + return ("__" + Prefix + FuncName).str(); } static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) { @@ -227,6 +233,14 @@ CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) SizeTy = CGM.SizeTy; VoidTy = CGM.VoidTy; PtrTy = CGM.UnqualPtrTy; + + if (CGM.getLangOpts().OffloadViaLLVM) { + Prefix = "llvm"; + SectionPrefix = "omp"; + } else if (CGM.getLangOpts().HIP) + SectionPrefix = Prefix = "hip"; + else + SectionPrefix = Prefix = "cuda"; } llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const { @@ -305,18 +319,58 @@ void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF, } if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), CudaFeature::CUDA_USES_NEW_LAUNCH) || - (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI)) + (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI) || + (CGF.getLangOpts().OffloadViaLLVM)) emitDeviceStubBodyNew(CGF, Args); else emitDeviceStubBodyLegacy(CGF, Args); } -// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local -// array and kernels are launched using cudaLaunchKernel(). -void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, - FunctionArgList &Args) { - // Build the shadow stack entry at the very start of the function. +/// CUDA passes the arguments with a level of indirection. For example, a +/// (void*, short, void*) is passed as {void **, short *, void **} to the launch +/// function. For the LLVM/offload launch we flatten the arguments into the +/// struct directly. In addition, we include the size of the arguments, thus +/// pass {sizeof({void *, short, void *}), ptr to {void *, short, void *}, +/// nullptr}. The last nullptr needs to be initialized to an array of pointers +/// pointing to the arguments if we want to offload to the host. 
+Address CGNVCUDARuntime::prepareKernelArgsLLVMOffload(CodeGenFunction &CGF, + FunctionArgList &Args) { + SmallVector<llvm::Type *> ArgTypes, KernelLaunchParamsTypes; + for (auto &Arg : Args) + ArgTypes.push_back(CGF.ConvertTypeForMem(Arg->getType())); + llvm::StructType *KernelArgsTy = llvm::StructType::create(ArgTypes); + + auto *Int64Ty = CGF.Builder.getInt64Ty(); + KernelLaunchParamsTypes.push_back(Int64Ty); + KernelLaunchParamsTypes.push_back(PtrTy); + KernelLaunchParamsTypes.push_back(PtrTy); + + llvm::StructType *KernelLaunchParamsTy = + llvm::StructType::create(KernelLaunchParamsTypes); + Address KernelArgs = CGF.CreateTempAllocaWithoutCast( + KernelArgsTy, CharUnits::fromQuantity(16), "kernel_args"); + Address KernelLaunchParams = CGF.CreateTempAllocaWithoutCast( + KernelLaunchParamsTy, CharUnits::fromQuantity(16), + "kernel_launch_params"); + + auto KernelArgsSize = CGM.getDataLayout().getTypeAllocSize(KernelArgsTy); + CGF.Builder.CreateStore(llvm::ConstantInt::get(Int64Ty, KernelArgsSize), + CGF.Builder.CreateStructGEP(KernelLaunchParams, 0)); + CGF.Builder.CreateStore(KernelArgs.emitRawPointer(CGF), + CGF.Builder.CreateStructGEP(KernelLaunchParams, 1)); + CGF.Builder.CreateStore(llvm::Constant::getNullValue(PtrTy), + CGF.Builder.CreateStructGEP(KernelLaunchParams, 2)); + + for (unsigned i = 0; i < Args.size(); ++i) { + auto *ArgVal = CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(Args[i])); + CGF.Builder.CreateStore(ArgVal, CGF.Builder.CreateStructGEP(KernelArgs, i)); + } + return KernelLaunchParams; +} + +Address CGNVCUDARuntime::prepareKernelArgs(CodeGenFunction &CGF, + FunctionArgList &Args) { // Calculate amount of space we will need for all arguments. If we have no // args, allocate a single pointer so we still have a valid pointer to the // argument array that we can pass to runtime, even if it will be unused. @@ -331,6 +385,17 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, VoidVarPtr, CGF.Builder.CreateConstGEP1_32( PtrTy, KernelArgs.emitRawPointer(CGF), i)); } + return KernelArgs; +} + +// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local +// array and kernels are launched using cudaLaunchKernel(). +void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, + FunctionArgList &Args) { + // Build the shadow stack entry at the very start of the function. + Address KernelArgs = CGF.getLangOpts().OffloadViaLLVM + ? prepareKernelArgsLLVMOffload(CGF, Args) + : prepareKernelArgs(CGF, Args); llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); @@ -1129,8 +1194,9 @@ void CGNVCUDARuntime::transformManagedVars() { // registered. The linker will provide a pointer to this section so we can // register the symbols with the linked device image. void CGNVCUDARuntime::createOffloadingEntries() { - StringRef Section = CGM.getLangOpts().HIP ? 
"hip_offloading_entries" - : "cuda_offloading_entries"; + SmallVector<char, 32> Out; + StringRef Section = (SectionPrefix + "_offloading_entries").toStringRef(Out); + llvm::Module &M = CGM.getModule(); for (KernelInfo &I : EmittedKernels) llvm::offloading::emitOffloadingEntry( @@ -1199,7 +1265,8 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() { } return nullptr; } - if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode) + if (CGM.getLangOpts().OffloadViaLLVM || + (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)) createOffloadingEntries(); else return makeModuleCtorFunction(); diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index f5ea73a04ae5c..815149a49d018 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -792,11 +792,13 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C, }) || C.getInputArgs().hasArg(options::OPT_hip_link) || C.getInputArgs().hasArg(options::OPT_hipstdpar); + bool UseLLVMOffload = C.getInputArgs().hasArg( + options::OPT_foffload_via_llvm, options::OPT_fno_offload_via_llvm, false); if (IsCuda && IsHIP) { Diag(clang::diag::err_drv_mix_cuda_hip); return; } - if (IsCuda) { + if (IsCuda && !UseLLVMOffload) { const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>(); const llvm::Triple &HostTriple = HostTC->getTriple(); auto OFK = Action::OFK_Cuda; @@ -818,7 +820,7 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C, CudaInstallation.WarnIfUnsupportedVersion(); } C.addOffloadDeviceToolChain(CudaTC.get(), OFK); - } else if (IsHIP) { + } else if (IsHIP && !UseLLVMOffload) { if (auto *OMPTargetArg = C.getInputArgs().getLastArg(options::OPT_fopenmp_targets_EQ)) { Diag(clang::diag::err_drv_unsupported_opt_for_language_mode) @@ -842,10 +844,11 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C, // We need to generate an OpenMP toolchain if the user specified targets with // the -fopenmp-targets option or used --offload-arch with OpenMP enabled. bool IsOpenMPOffloading = - C.getInputArgs().hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ, - options::OPT_fno_openmp, false) && - (C.getInputArgs().hasArg(options::OPT_fopenmp_targets_EQ) || - C.getInputArgs().hasArg(options::OPT_offload_arch_EQ)); + ((IsCuda || IsHIP) && UseLLVMOffload) || + (C.getInputArgs().hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ, + options::OPT_fno_openmp, false) && + (C.getInputArgs().hasArg(options::OPT_fopenmp_targets_EQ) || + C.getInputArgs().hasArg(options::OPT_offload_arch_EQ))); if (IsOpenMPOffloading) { // We expect that -fopenmp-targets is always used in conjunction with the // option -fopenmp specifying a valid runtime with offloading support, i.e. 
@@ -873,7 +876,7 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C, for (StringRef T : OpenMPTargets->getValues()) OpenMPTriples.insert(T); } else if (C.getInputArgs().hasArg(options::OPT_offload_arch_EQ) && - !IsHIP && !IsCuda) { + ((!IsHIP && !IsCuda) || UseLLVMOffload)) { const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>(); auto AMDTriple = getHIPOffloadTargetTriple(*this, C.getInputArgs()); auto NVPTXTriple = getNVIDIAOffloadTargetTriple(*this, C.getInputArgs(), @@ -4138,6 +4141,8 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args, bool UseNewOffloadingDriver = C.isOffloadingHostKind(Action::OFK_OpenMP) || + Args.hasFlag(options::OPT_foffload_via_llvm, + options::OPT_fno_offload_via_llvm, false) || Args.hasFlag(options::OPT_offload_new_driver, options::OPT_no_offload_new_driver, false); diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 4e1c52462e584..03196d9e438af 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -1125,6 +1125,18 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA, CmdArgs.push_back("__clang_openmp_device_functions.h"); } + if (Args.hasArg(options::OPT_foffload_via_llvm)) { + // Add llvm_wrappers/* to our system include path. This lets us wrap + // standard library headers and other headers. + SmallString<128> P(D.ResourceDir); + llvm::sys::path::append(P, "include", "llvm_offload_wrappers"); + CmdArgs.append({"-internal-isystem", Args.MakeArgString(P), "-include"}); + if (JA.isDeviceOffloading(Action::OFK_OpenMP)) + CmdArgs.push_back("__llvm_offload_device.h"); + else + CmdArgs.push_back("__llvm_offload_host.h"); + } + // Add -i* options, and automatically translate to // -include-pch/-include-pth for transparent PCH support. It's // wonky, but we include looking for .gch so we can support seamless @@ -6595,6 +6607,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, // device offloading action other than OpenMP. if (Args.hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ, options::OPT_fno_openmp, false) && + !Args.hasFlag(options::OPT_foffload_via_llvm, + options::OPT_fno_offload_via_llvm, false) && (JA.isDeviceOffloading(Action::OFK_None) || JA.isDeviceOffloading(Action::OFK_OpenMP))) { switch (D.getOpenMPRuntime(Args)) { @@ -6672,11 +6686,16 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, Args.addOptOutFlag(CmdArgs, options::OPT_fopenmp_extensions, options::OPT_fno_openmp_extensions); } - - // Forward the new driver to change offloading code generation. - if (Args.hasFlag(options::OPT_offload_new_driver, - options::OPT_no_offload_new_driver, false)) + // Forward the offload runtime change to code generation, liboffload implies + // new driver. Otherwise, check if we should forward the new driver to change + // offloading code generation. 
+ if (Args.hasFlag(options::OPT_foffload_via_llvm, + options::OPT_fno_offload_via_llvm, false)) { + CmdArgs.append({"--offload-new-driver", "-foffload-via-llvm"}); + } else if (Args.hasFlag(options::OPT_offload_new_driver, + options::OPT_no_offload_new_driver, false)) { CmdArgs.push_back("--offload-new-driver"); + } SanitizeArgs.addArgs(TC, Args, CmdArgs, InputType); diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 71e993119436a..74a69f65f7ad5 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -1144,8 +1144,13 @@ bool tools::addOpenMPRuntime(const Compilation &C, ArgStringList &CmdArgs, bool ForceStaticHostRuntime, bool IsOffloadingHost, bool GompNeedsRT) { if (!Args.hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ, - options::OPT_fno_openmp, false)) + options::OPT_fno_openmp, false)) { + // We need libomptarget (liboffload) if it's the choosen offloading runtime. + if (Args.hasFlag(options::OPT_foffload_via_llvm, + options::OPT_fno_offload_via_llvm, false)) + CmdArgs.push_back("-lomptarget"); return false; + } Driver::OpenMPRuntimeKind RTKind = TC.getDriver().getOpenMPRuntime(Args); diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp index d5f93c9c830fa..3670f42d2e210 100644 --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -858,17 +858,15 @@ void CudaToolChain::addClangTargetOptions( DeviceOffloadingKind == Action::OFK_Cuda) && "Only OpenMP or CUDA offloading kinds are supported for NVIDIA GPUs."); - if (DeviceOffloadingKind == Action::OFK_Cuda) { - CC1Args.append( - {"-fcuda-is-device", "-mllvm", "-enable-memcpyopt-without-libcalls"}); - - // Unsized function arguments used for variadics were introduced in CUDA-9.0 - // We still do not support generating code that actually uses variadic - // arguments yet, but we do need to allow parsing them as recent CUDA - // headers rely on that. https://github.com/llvm/llvm-project/issues/58410 - if (CudaInstallation.version() >= CudaVersion::CUDA_90) - CC1Args.push_back("-fcuda-allow-variadic-functions"); - } + CC1Args.append( + {"-fcuda-is-device", "-mllvm", "-enable-memcpyopt-without-libcalls"}); + + // Unsized function arguments used for variadics were introduced in CUDA-9.0 + // We still do not support generating code that actually uses variadic + // arguments yet, but we do need to allow parsing them as recent CUDA + // headers rely on that. https://github.com/llvm/llvm-project/issues/58410 + if (CudaInstallation.version() >= CudaVersion::CUDA_90) + CC1Args.push_back("-fcuda-allow-variadic-functions"); if (DriverArgs.hasArg(options::OPT_nogpulib)) return; @@ -886,6 +884,13 @@ void CudaToolChain::addClangTargetOptions( CC1Args.push_back("-mlink-builtin-bitcode"); CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile)); + // For now, we don't use any Offload/OpenMP device runtime when we offload + // CUDA via LLVM/Offload. We should split the Offload/OpenMP device runtime + // and include the "generic" (or CUDA-specific) parts. 
+ if (DriverArgs.hasFlag(options::OPT_foffload_via_llvm, + options::OPT_fno_offload_via_llvm, false)) + return; + clang::CudaVersion CudaInstallationVersion = CudaInstallation.version(); if (DriverArgs.hasFlag(options::OPT_fcuda_short_ptr, diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index d3090e488306f..9e0eb0f4cde89 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -325,6 +325,12 @@ set(openmp_wrapper_files openmp_wrappers/new ) +set(llvm_offload_wrapper_files + llvm_offload_wrappers/__llvm_offload.h + llvm_offload_wrappers/__llvm_offload_host.h + llvm_offload_wrappers/__llvm_offload_device.h +) + set(llvm_libc_wrapper_files llvm_libc_wrappers/assert.h llvm_libc_wrappers/stdio.h @@ -375,7 +381,7 @@ endfunction(clang_generate_header) # Copy header files from the source directory to the build directory foreach( f ${files} ${cuda_wrapper_files} ${cuda_wrapper_bits_files} ${ppc_wrapper_files} ${openmp_wrapper_files} ${zos_wrapper_files} ${hlsl_files} - ${llvm_libc_wrapper_files}) + ${llvm_libc_wrapper_files} ${llvm_offload_wrapper_files}) copy_header_to_output_dir(${CMAKE_CURRENT_SOURCE_DIR} ${f}) endforeach( f ) @@ -501,6 +507,7 @@ add_header_target("hlsl-resource-headers" ${hlsl_files}) add_header_target("opencl-resource-headers" ${opencl_files}) add_header_target("llvm-libc-resource-headers" ${llvm_libc_wrapper_files}) add_header_target("openmp-resource-headers" ${openmp_wrapper_files}) +add_header_target("llvm-offload-resource-headers" ${llvm_libc_wrapper_files}) add_header_target("windows-resource-headers" ${windows_only_files}) add_header_target("utility-resource-headers" ${utility_files}) @@ -542,6 +549,11 @@ install( DESTINATION ${header_install_dir}/openmp_wrappers COMPONENT clang-resource-headers) +install( + FILES ${llvm_offload_wrapper_files} + DESTINATION ${header_install_dir}/llvm_offload_wrappers + COMPONENT clang-resource-headers) + install( FILES ${zos_wrapper_files} DESTINATION ${header_install_dir}/zos_wrappers @@ -704,8 +716,8 @@ install( COMPONENT openmp-resource-headers) install( - FILES ${openmp_wrapper_files} - DESTINATION ${header_install_dir}/openmp_wrappers + FILES ${llvm_offload_wrapper_files} + DESTINATION ${header_install_dir}/llvm_offload_wrappers EXCLUDE_FROM_ALL COMPONENT openmp-resource-headers) diff --git a/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload.h b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload.h new file mode 100644 index 0000000000000..a570836918acb --- /dev/null +++ b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload.h @@ -0,0 +1,31 @@ +/*===------ LLVM/Offload helpers for kernel languages (CUDA/HIP) -*- c++ -*-=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#include <stdlib.h> + +#define __host__ __attribute__((host)) +#define __device__ __attribute__((device)) +#define __global__ __attribute__((global)) +#define __shared__ __attribute__((shared)) +#define __constant__ __attribute__((constant)) +#define __managed__ __attribute__((managed)) + +extern "C" { + +typedef struct dim3 { + dim3() {} + dim3(unsigned x) : x(x) {} + unsigned x = 0, y = 0, z = 0; +} dim3; + +// TODO: For some reason the CUDA device compilation requires this declaration +// to be present on the device while it is only used on the host. +unsigned __llvmPushCallConfiguration(dim3 gridDim, dim3 blockDim, + size_t sharedMem = 0, void *stream = 0); +} diff --git a/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_device.h b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_device.h new file mode 100644 index 0000000000000..1a813b331515b --- /dev/null +++ b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_device.h @@ -0,0 +1,10 @@ +/*===------ LLVM/Offload helpers for kernel languages (CUDA/HIP) -*- c++ -*-=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#include "__llvm_offload.h" diff --git a/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_host.h b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_host.h new file mode 100644 index 0000000000000..160289d169b55 --- /dev/null +++ b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_host.h @@ -0,0 +1,15 @@ +/*===------ LLVM/Offload helpers for kernel languages (CUDA/HIP) -*- c++ -*-=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#include "__llvm_offload.h" + +extern "C" { +unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, + void **args, size_t sharedMem = 0, void *stream = 0); +} diff --git a/clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h b/clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h index d5b6846b03488..3e354c63efc66 100644 --- a/clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h +++ b/clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h @@ -10,17 +10,15 @@ #ifndef __CLANG_OPENMP_DEVICE_FUNCTIONS_H__ #define __CLANG_OPENMP_DEVICE_FUNCTIONS_H__ -#ifndef _OPENMP -#error "This file is for OpenMP compilation only." 
-#endif - #ifdef __cplusplus extern "C" { #endif +#ifdef __NVPTX__ #pragma omp begin declare variant match( \ device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)}) +#pragma push_macro("__CUDA__") #define __CUDA__ #define __OPENMP_NVPTX__ @@ -31,9 +29,10 @@ extern "C" { #include <__clang_cuda_device_functions.h> #undef __OPENMP_NVPTX__ -#undef __CUDA__ +#pragma pop_macro("__CUDA__") #pragma omp end declare variant +#endif #ifdef __AMDGCN__ #pragma omp begin declare variant match(device = {arch(amdgcn)}) diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp index 80ea43dc5316e..b507e19556363 100644 --- a/clang/lib/Sema/SemaCUDA.cpp +++ b/clang/lib/Sema/SemaCUDA.cpp @@ -1059,6 +1059,9 @@ void SemaCUDA::inheritTargetAttrs(FunctionDecl *FD, } std::string SemaCUDA::getConfigureFuncName() const { + if (getLangOpts().OffloadViaLLVM) + return "__llvmPushCallConfiguration"; + if (getLangOpts().HIP) return getLangOpts().HIPUseNewLaunchAPI ? "__hipPushCallConfiguration" : "hipConfigureCall"; diff --git a/clang/test/CodeGenCUDA/offload_via_llvm.cu b/clang/test/CodeGenCUDA/offload_via_llvm.cu new file mode 100644 index 0000000000000..3eb580850fc48 --- /dev/null +++ b/clang/test/CodeGenCUDA/offload_via_llvm.cu @@ -0,0 +1,97 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang -Xclang -triple -Xclang "x86_64-unknown-linux-gnu" -S -c -foffload-via-llvm -emit-llvm -o - %s | FileCheck %s + +// Check that we generate LLVM/Offload calls, including the KERNEL_LAUNCH_PARAMS argument. + +// CHECK-LABEL: define dso_local void @_Z18__device_stub__fooisPvS_( +// CHECK-SAME: i32 noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], ptr noundef [[TMP3:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[DOTADDR3:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[TMP0]], align 16 +// CHECK-NEXT: [[KERNEL_LAUNCH_PARAMS:%.*]] = alloca [[TMP1]], align 16 +// CHECK-NEXT: [[GRID_DIM:%.*]] = alloca [[STRUCT_DIM3:%.*]], align 8 +// CHECK-NEXT: [[BLOCK_DIM:%.*]] = alloca [[STRUCT_DIM3]], align 8 +// CHECK-NEXT: [[SHMEM_SIZE:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[STREAM:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[GRID_DIM_COERCE:%.*]] = alloca { i64, i32 }, align 8 +// CHECK-NEXT: [[BLOCK_DIM_COERCE:%.*]] = alloca { i64, i32 }, align 8 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[DOTADDR]], align 4 +// CHECK-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1]], align 2 +// CHECK-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 8 +// CHECK-NEXT: store ptr [[TMP3]], ptr [[DOTADDR3]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[TMP1]], ptr [[KERNEL_LAUNCH_PARAMS]], i32 0, i32 0 +// CHECK-NEXT: store i64 24, ptr [[TMP4]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[TMP1]], ptr [[KERNEL_LAUNCH_PARAMS]], i32 0, i32 1 +// CHECK-NEXT: store ptr [[KERNEL_ARGS]], ptr [[TMP5]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[TMP1]], ptr [[KERNEL_LAUNCH_PARAMS]], i32 0, i32 2 +// CHECK-NEXT: store ptr null, ptr [[TMP6]], align 16 +// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTADDR]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[TMP0]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK-NEXT: store i32 [[TMP7]], ptr [[TMP8]], align 16 
+// CHECK-NEXT: [[TMP9:%.*]] = load i16, ptr [[DOTADDR1]], align 2 +// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[TMP0]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK-NEXT: store i16 [[TMP9]], ptr [[TMP10]], align 4 +// CHECK-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTADDR2]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[TMP0]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK-NEXT: store ptr [[TMP11]], ptr [[TMP12]], align 8 +// CHECK-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTADDR3]], align 8 +// CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[TMP0]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK-NEXT: store ptr [[TMP13]], ptr [[TMP14]], align 16 +// CHECK-NEXT: [[TMP15:%.*]] = call i32 @__llvmPopCallConfiguration(ptr [[GRID_DIM]], ptr [[BLOCK_DIM]], ptr [[SHMEM_SIZE]], ptr [[STREAM]]) +// CHECK-NEXT: [[TMP16:%.*]] = load i64, ptr [[SHMEM_SIZE]], align 8 +// CHECK-NEXT: [[TMP17:%.*]] = load ptr, ptr [[STREAM]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[GRID_DIM_COERCE]], ptr align 8 [[GRID_DIM]], i64 12, i1 false) +// CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[GRID_DIM_COERCE]], i32 0, i32 0 +// CHECK-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP18]], align 8 +// CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[GRID_DIM_COERCE]], i32 0, i32 1 +// CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[BLOCK_DIM_COERCE]], ptr align 8 [[BLOCK_DIM]], i64 12, i1 false) +// CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[BLOCK_DIM_COERCE]], i32 0, i32 0 +// CHECK-NEXT: [[TMP23:%.*]] = load i64, ptr [[TMP22]], align 8 +// CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[BLOCK_DIM_COERCE]], i32 0, i32 1 +// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 8 +// CHECK-NEXT: [[CALL:%.*]] = call noundef i32 @llvmLaunchKernel(ptr noundef @_Z18__device_stub__fooisPvS_, i64 [[TMP19]], i32 [[TMP21]], i64 [[TMP23]], i32 [[TMP25]], ptr noundef [[KERNEL_LAUNCH_PARAMS]], i64 noundef [[TMP16]], ptr noundef [[TMP17]]) +// CHECK-NEXT: br label %[[SETUP_END:.*]] +// CHECK: [[SETUP_END]]: +// CHECK-NEXT: ret void +// +__global__ void foo(int, short, void *, void *) {} + +// CHECK-LABEL: define dso_local void @_Z5test1Pv( +// CHECK-SAME: ptr noundef [[PTR:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[AGG_TMP:%.*]] = alloca [[STRUCT_DIM3:%.*]], align 4 +// CHECK-NEXT: [[AGG_TMP1:%.*]] = alloca [[STRUCT_DIM3]], align 4 +// CHECK-NEXT: [[AGG_TMP_COERCE:%.*]] = alloca { i64, i32 }, align 4 +// CHECK-NEXT: [[AGG_TMP1_COERCE:%.*]] = alloca { i64, i32 }, align 4 +// CHECK-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8 +// CHECK-NEXT: call void @_ZN4dim3C2Ej(ptr noundef nonnull align 4 dereferenceable(12) [[AGG_TMP]], i32 noundef 3) +// CHECK-NEXT: call void @_ZN4dim3C2Ej(ptr noundef nonnull align 4 dereferenceable(12) [[AGG_TMP1]], i32 noundef 7) +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[AGG_TMP_COERCE]], ptr align 4 [[AGG_TMP]], i64 12, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[AGG_TMP_COERCE]], i32 0, i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[TMP0]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[AGG_TMP_COERCE]], i32 0, i32 1 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 +// CHECK-NEXT: 
call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[AGG_TMP1_COERCE]], ptr align 4 [[AGG_TMP1]], i64 12, i1 false) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[AGG_TMP1_COERCE]], i32 0, i32 0 +// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP4]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[AGG_TMP1_COERCE]], i32 0, i32 1 +// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 +// CHECK-NEXT: [[CALL:%.*]] = call i32 @__llvmPushCallConfiguration(i64 [[TMP1]], i32 [[TMP3]], i64 [[TMP5]], i32 [[TMP7]], i64 noundef 0, ptr noundef null) +// CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[CALL]], 0 +// CHECK-NEXT: br i1 [[TOBOOL]], label %[[KCALL_END:.*]], label %[[KCALL_CONFIGOK:.*]] +// CHECK: [[KCALL_CONFIGOK]]: +// CHECK-NEXT: [[TMP8:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 +// CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 +// CHECK-NEXT: call void @_Z18__device_stub__fooisPvS_(i32 noundef 13, i16 noundef signext 1, ptr noundef [[TMP8]], ptr noundef [[TMP9]]) #[[ATTR5:[0-9]+]] +// CHECK-NEXT: br label %[[KCALL_END]] +// CHECK: [[KCALL_END]]: +// CHECK-NEXT: ret void +// +void test1(void *Ptr) { + foo<<<3, 7>>>(13, 1, Ptr, Ptr); +} diff --git a/clang/test/Driver/cuda-via-liboffload.cu b/clang/test/Driver/cuda-via-liboffload.cu new file mode 100644 index 0000000000000..68dc963e906b2 --- /dev/null +++ b/clang/test/Driver/cuda-via-liboffload.cu @@ -0,0 +1,23 @@ +// RUN: %clang -### -target x86_64-linux-gnu -foffload-via-llvm -ccc-print-bindings \ +// RUN: --offload-arch=sm_35 --offload-arch=sm_70 %s 2>&1 \ +// RUN: | FileCheck -check-prefix BINDINGS %s + +// BINDINGS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[HOST_BC:.+]]" +// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT]]", "[[HOST_BC]]"], output: "[[PTX_SM_35:.+]]" +// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[PTX_SM_35]]"], output: "[[CUBIN_SM_35:.+]]" +// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT]]", "[[HOST_BC]]"], output: "[[PTX_SM_70:.+]]" +// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[PTX_SM_70:.+]]"], output: "[[CUBIN_SM_70:.+]]" +// BINDINGS-NEXT: "x86_64-unknown-linux-gnu" - "Offload::Packager", inputs: ["[[CUBIN_SM_35]]", "[[CUBIN_SM_70]]"], output: "[[BINARY:.+]]" +// BINDINGS-NEXT: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[HOST_BC]]", "[[BINARY]]"], output: "[[HOST_OBJ:.+]]" +// BINDINGS-NEXT: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[HOST_OBJ]]"], output: "a.out" + +// RUN: %clang -### -target x86_64-linux-gnu -foffload-via-llvm -ccc-print-bindings \ +// RUN: --offload-arch=sm_35 --offload-arch=sm_70 %s 2>&1 \ +// RUN: | FileCheck -check-prefix BINDINGS-DEVICE %s + +// BINDINGS-DEVICE: # "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[PTX:.+]]" +// BINDINGS-DEVICE: # "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[PTX]]"], output: "[[CUBIN:.+]]" + +// RUN: %clang -### -target x86_64-linux-gnu -ccc-print-bindings --offload-link -foffload-via-llvm %s 2>&1 | FileCheck -check-prefix DEVICE-LINK %s + +// DEVICE-LINK: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[INPUT:.+]]"], output: "a.out" diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h index 1dd69baa7b578..6bea7df0a937f 100644 --- a/offload/include/Shared/APITypes.h +++ b/offload/include/Shared/APITypes.h @@ -102,8 +102,9 @@ struct KernelArgsTy { 0; // Tripcount for 
the teams / distribute loop, 0 otherwise. struct { uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause. - uint64_t Unused : 63; - } Flags = {0, 0}; + uint64_t IsCUDA : 1; // Was this kernel spawned via CUDA. + uint64_t Unused : 62; + } Flags = {0, 0, 0}; uint32_t NumTeams[3] = {0, 0, 0}; // The number of teams (for x,y,z dimension). uint32_t ThreadLimit[3] = {0, 0, diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h index 323dee41630f2..2b6445e9fbe55 100644 --- a/offload/include/omptarget.h +++ b/offload/include/omptarget.h @@ -107,7 +107,7 @@ enum TargetAllocTy : int32_t { inline KernelArgsTy CTorDTorKernelArgs = {1, 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, - 0, {0,0}, {1, 0, 0}, {1, 0, 0}, 0}; + 0, {0,0,0}, {1, 0, 0}, {1, 0, 0}, 0}; struct DeviceTy; diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index 00e12aecf7512..5df54e9155ae1 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -549,9 +549,16 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs, if (!KernelLaunchEnvOrErr) return KernelLaunchEnvOrErr.takeError(); - KernelLaunchParamsTy LaunchParams = - prepareArgs(GenericDevice, ArgPtrs, ArgOffsets, KernelArgs.NumArgs, Args, - Ptrs, *KernelLaunchEnvOrErr); + KernelLaunchParamsTy LaunchParams; + + // Kernel languages don't use indirection. + if (KernelArgs.Flags.IsCUDA) { + LaunchParams = *reinterpret_cast<KernelLaunchParamsTy *>(KernelArgs.ArgPtrs); + } else { + LaunchParams = + prepareArgs(GenericDevice, ArgPtrs, ArgOffsets, KernelArgs.NumArgs, + Args, Ptrs, *KernelLaunchEnvOrErr); + } uint32_t NumThreads = getNumThreads(GenericDevice, KernelArgs.ThreadLimit); uint64_t NumBlocks = diff --git a/offload/src/CMakeLists.txt b/offload/src/CMakeLists.txt index efa5cdab33ec9..b442df45deaa5 100644 --- a/offload/src/CMakeLists.txt +++ b/offload/src/CMakeLists.txt @@ -22,6 +22,7 @@ add_llvm_library(omptarget OpenMP/InteropAPI.cpp OpenMP/OMPT/Callback.cpp + KernelLanguage/API.cpp ADDITIONAL_HEADER_DIRS ${LIBOMPTARGET_INCLUDE_DIR} diff --git a/offload/src/KernelLanguage/API.cpp b/offload/src/KernelLanguage/API.cpp new file mode 100644 index 0000000000000..9ffc199b5da7d --- /dev/null +++ b/offload/src/KernelLanguage/API.cpp @@ -0,0 +1,76 @@ +//===------ API.cpp - Kernel Language (CUDA/HIP) entry points ----- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + + +#include "Shared/APITypes.h" + +#include <cstdio> + +struct dim3 { + unsigned x = 0, y = 0, z = 0; +}; + +struct __omp_kernel_t { + dim3 __grid_size; + dim3 __block_size; + size_t __shared_memory; + + void *__stream; +}; + +static __omp_kernel_t __current_kernel = {}; +#pragma omp threadprivate(__current_kernel); + +extern "C" { + +// TODO: There is little reason we need to keep these names or the way calls are +// issued. For now we do to avoid modifying Clang's CUDA codegen. Unclear when +// we actually need to push/pop configurations. 
+unsigned __llvmPushCallConfiguration(dim3 __grid_size, dim3 __block_size, + size_t __shared_memory, void *__stream) { + __omp_kernel_t &__kernel = __current_kernel; + __kernel.__grid_size = __grid_size; + __kernel.__block_size = __block_size; + __kernel.__shared_memory = __shared_memory; + __kernel.__stream = __stream; + return 0; +} + +unsigned __llvmPopCallConfiguration(dim3 *__grid_size, dim3 *__block_size, + size_t *__shared_memory, void *__stream) { + __omp_kernel_t &__kernel = __current_kernel; + *__grid_size = __kernel.__grid_size; + *__block_size = __kernel.__block_size; + *__shared_memory = __kernel.__shared_memory; + *((void **)__stream) = __kernel.__stream; + return 0; +} + +int __tgt_target_kernel(void *Loc, int64_t DeviceId, int32_t NumTeams, + int32_t ThreadLimit, const void *HostPtr, + KernelArgsTy *Args); + +unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, + void *args, size_t sharedMem, void *stream) { + KernelArgsTy Args = {}; + Args.DynCGroupMem = sharedMem; + Args.NumTeams[0] = gridDim.x; + Args.NumTeams[1] = gridDim.y; + Args.NumTeams[2] = gridDim.z; + Args.ThreadLimit[0] = blockDim.x; + Args.ThreadLimit[1] = blockDim.y; + Args.ThreadLimit[2] = blockDim.z; + Args.ArgPtrs = reinterpret_cast<void **>(args); + Args.Flags.IsCUDA = true; + int rv = __tgt_target_kernel(nullptr, 0, gridDim.x, + blockDim.x, func, &Args); + return rv; +} +} diff --git a/offload/src/exports b/offload/src/exports index f95544ec8329c..7bdc7d2a531bb 100644 --- a/offload/src/exports +++ b/offload/src/exports @@ -71,6 +71,9 @@ VERS1.0 { __tgt_interop_use; __tgt_interop_destroy; ompt_libomptarget_connect; + __llvmPushCallConfiguration; + __llvmPopCallConfiguration; + llvmLaunchKernel; local: *; }; diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg index 6c590603079c4..9053151e44a78 100644 --- a/offload/test/lit.cfg +++ b/offload/test/lit.cfg @@ -66,7 +66,7 @@ def evaluate_bool_env(env): config.name = 'libomptarget :: ' + config.libomptarget_current_target # suffixes: A list of file extensions to treat as test files. -config.suffixes = ['.c', '.cpp', '.cc', '.f90'] +config.suffixes = ['.c', '.cpp', '.cc', '.f90', '.cu'] # excludes: A list of directories to exclude from the testuites. 
 config.excludes = ['Inputs']
diff --git a/offload/test/offloading/CUDA/basic_launch.cu b/offload/test/offloading/CUDA/basic_launch.cu
new file mode 100644
index 0000000000000..2915a7c216ab5
--- /dev/null
+++ b/offload/test/offloading/CUDA/basic_launch.cu
@@ -0,0 +1,31 @@
+// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t
+// RUN: %t | %fcheck-generic
+// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t -fopenmp
+// RUN: %t | %fcheck-generic
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+
+#include <stdio.h>
+
+extern "C" {
+void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
+void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum);
+}
+
+__global__ void square(int *A) { *A = 42; }
+
+int main(int argc, char **argv) {
+  int DevNo = 0;
+  int *Ptr = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(4, DevNo));
+  *Ptr = 7;
+  printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+  // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 7
+  square<<<1, 1>>>(Ptr);
+  printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+  // CHECK: Ptr [[Ptr]], *Ptr: 42
+  llvm_omp_target_free_shared(Ptr, DevNo);
+}
diff --git a/offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu b/offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu
new file mode 100644
index 0000000000000..615cae6f7b233
--- /dev/null
+++ b/offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu
@@ -0,0 +1,32 @@
+// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t
+// RUN: %t | %fcheck-generic
+// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t -fopenmp
+// RUN: %t | %fcheck-generic
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+#include <stdio.h>
+
+extern "C" {
+void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
+void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum);
+}
+
+__global__ void square(int *A) {
+  __scoped_atomic_fetch_add(A, 1, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_DEVICE);
+}
+
+int main(int argc, char **argv) {
+  int DevNo = 0;
+  int *Ptr = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(4, DevNo));
+  *Ptr = 0;
+  printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+  // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 0
+  square<<<7, 6>>>(Ptr);
+  printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+  // CHECK: Ptr [[Ptr]], *Ptr: 42
+  llvm_omp_target_free_shared(Ptr, DevNo);
+}
diff --git a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
new file mode 100644
index 0000000000000..f95f1dbacc79c
--- /dev/null
+++ b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
@@ -0,0 +1,41 @@
+// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t
+// RUN: %t | %fcheck-generic
+// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t -fopenmp
+// RUN: %t | %fcheck-generic
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+#include <stdio.h>
+
+extern "C" {
+void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
+void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum);
+}
+
+__global__ void square(int *Dst, short Q, int *Src, short P) {
+  *Dst = (Src[0] + Src[1]) * (Q + P);
+  Src[0] = Q;
+  Src[1] = P;
+}
+
+int main(int argc, char **argv) {
+  int DevNo = 0;
+  int *Ptr = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(4, DevNo));
+  int *Src = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(8, DevNo));
+  *Ptr = 7;
+  Src[0] = -2;
+  Src[1] = 8;
+  printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+  // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 7
+  printf("Src: %i : %i\n", Src[0], Src[1]);
+  // CHECK: Src: -2 : 8
+  square<<<1, 1>>>(Ptr, 3, Src, 4);
+  printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+  // CHECK: Ptr [[Ptr]], *Ptr: 42
+  printf("Src: %i : %i\n", Src[0], Src[1]);
+  // CHECK: Src: 3 : 4
+  llvm_omp_target_free_shared(Ptr, DevNo);
+}
diff --git a/offload/test/offloading/CUDA/kernel_tu.cu.inc b/offload/test/offloading/CUDA/kernel_tu.cu.inc
new file mode 100644
index 0000000000000..d7d28a109dfc5
--- /dev/null
+++ b/offload/test/offloading/CUDA/kernel_tu.cu.inc
@@ -0,0 +1 @@
+__global__ void square(int *A) { *A = 42; }
diff --git a/offload/test/offloading/CUDA/launch_tu.cu b/offload/test/offloading/CUDA/launch_tu.cu
new file mode 100644
index 0000000000000..1aaf106fadee4
--- /dev/null
+++ b/offload/test/offloading/CUDA/launch_tu.cu
@@ -0,0 +1,32 @@
+// clang-format off
+// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t.launch_tu.o -c
+// RUN: %clang++ -foffload-via-llvm --offload-arch=native %S/kernel_tu.cu.inc -x cuda -o %t.kernel_tu.o -c
+// RUN: %clang++ -foffload-via-llvm --offload-arch=native %t.launch_tu.o %t.kernel_tu.o -o %t
+// RUN: %t | %fcheck-generic
+// clang-format on
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+#include <stdio.h>
+
+extern "C" {
+void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
+void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum);
+}
+
+extern __global__ void square(int *A);
+
+int main(int argc, char **argv) {
+  int DevNo = 0;
+  int *Ptr = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(4, DevNo));
+  *Ptr = 7;
+  printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+  // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 7
+  square<<<1, 1>>>(Ptr);
+  printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+  // CHECK: Ptr [[Ptr]], *Ptr: 42
+  llvm_omp_target_free_shared(Ptr, DevNo);
+}
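
For readers unfamiliar with the CUDA launch path, the sketch below shows the call sequence a caller such as Clang's -foffload-via-llvm codegen is expected to issue against the three entry points added to offload/src/exports; as the PluginInterface.cpp hunk shows, Flags.IsCUDA is what routes such launches around prepareArgs. This is a hand-written illustration, not code from this patch: the stub name launch_square_stub, the local dim3 mirror, and the forward declarations are assumptions made only to keep the example self-contained.

#include <cstddef>

struct dim3 {
  unsigned x = 0, y = 0, z = 0;
};

extern "C" {
unsigned __llvmPushCallConfiguration(dim3 GridSize, dim3 BlockSize,
                                     size_t SharedMemory, void *Stream);
unsigned __llvmPopCallConfiguration(dim3 *GridSize, dim3 *BlockSize,
                                    size_t *SharedMemory, void *Stream);
unsigned llvmLaunchKernel(const void *Func, dim3 GridDim, dim3 BlockDim,
                          void *Args, size_t SharedMem, void *Stream);
}

// Hypothetical stand-in for the compiler-generated stub behind
// `square<<<1, 1>>>(Ptr)`.
unsigned launch_square_stub(const void *KernelFunc, int *Ptr) {
  dim3 Grid{1, 1, 1}, Block{1, 1, 1}; // models <<<1, 1>>>
  size_t SharedMem = 0;
  void *Stream = nullptr;
  // The <<<...>>> expression lowers to a push of the launch configuration ...
  __llvmPushCallConfiguration(Grid, Block, SharedMem, Stream);
  // ... which the stub pops again immediately before the launch.
  __llvmPopCallConfiguration(&Grid, &Block, &SharedMem, &Stream);
  // Kernel arguments travel as a flat array of pointers to each argument;
  // llvmLaunchKernel stores that array in KernelArgsTy::ArgPtrs and sets
  // Flags.IsCUDA so the plugin skips the OpenMP argument indirection.
  void *KernelArgs[] = {&Ptr};
  return llvmLaunchKernel(KernelFunc, Grid, Block, KernelArgs, SharedMem,
                          Stream);
}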