[clang] [llvm] [Offload] Introduce the concept of "default streams" (PR #95371)

2024-06-15 Thread Johannes Doerfert via cfe-commits

https://github.com/jdoerfert updated 
https://github.com/llvm/llvm-project/pull/95371

>From d06585044bd6d2dd76d6110bce933e01fd4b333e Mon Sep 17 00:00:00 2001
From: Johannes Doerfert 
Date: Mon, 3 Jun 2024 19:52:12 -0700
Subject: [PATCH 1/3] [Offload][CUDA] Allow CUDA kernels to use LLVM/Offload
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Through the new `-foffload-via-llvm` flag, CUDA kernels can now be
lowered to the LLVM/Offload API. On the Clang side, this is simply done
by using the OpenMP offload toolchain and emitting calls to `llvm*`
functions to orchestrate the kernel launch rather than `cuda*`
functions. These `llvm*` functions are implemented on top of the
existing LLVM/Offload API.

As we are about to redefine the Offload API, this will help us in the
design process as a second offload language.

We do not support any CUDA APIs yet, however, we could:
  https://www.osti.gov/servlets/purl/1892137

For proper host execution we need to resurrect/rebase
  https://tianshilei.me/wp-content/uploads/2021/12/llpp-2021.pdf
(which was designed for debugging).

```
❯❯❯ cat test.cu
extern "C" {
void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum);
}

__global__ void square(int *A) { *A = 42; }

int main(int argc, char **argv) {
  int DevNo = 0;
  int *Ptr = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(4, DevNo));
  *Ptr = 7;
  printf("Ptr %p, *Ptr %i\n", Ptr, *Ptr);
  square<<<1, 1>>>(Ptr);
  printf("Ptr %p, *Ptr %i\n", Ptr, *Ptr);
  llvm_omp_target_free_shared(Ptr, DevNo);
}

❯❯❯ clang++ test.cu -O3 -o test123 -foffload-via-llvm --offload-arch=native

❯❯❯ llvm-objdump --offloading test123

test123:	file format elf64-x86-64

OFFLOADING IMAGE [0]:
kind            elf
arch            gfx90a
triple          amdgcn-amd-amdhsa
producer        openmp

❯❯❯ LIBOMPTARGET_INFO=16 ./test123
Ptr 0x155448ac8000, *Ptr 7
Ptr 0x155448ac8000, *Ptr 42
```
---
 clang/include/clang/Basic/LangOptions.def |  1 +
 clang/include/clang/Driver/Options.td |  6 ++
 clang/lib/CodeGen/CGCUDANV.cpp| 97 ---
 clang/lib/Driver/Driver.cpp   | 19 ++--
 clang/lib/Driver/ToolChains/Clang.cpp | 27 +-
 clang/lib/Driver/ToolChains/CommonArgs.cpp|  7 +-
 clang/lib/Driver/ToolChains/Cuda.cpp  | 27 +++---
 clang/lib/Headers/CMakeLists.txt  | 18 +++-
 .../llvm_offload_wrappers/__llvm_offload.h| 31 ++
 .../__llvm_offload_device.h   | 10 ++
 .../__llvm_offload_host.h | 15 +++
 .../__clang_openmp_device_functions.h |  9 +-
 clang/lib/Sema/SemaCUDA.cpp   |  3 +
 clang/test/CodeGenCUDA/offload_via_llvm.cu| 97 +++
 clang/test/Driver/cuda-via-liboffload.cu  | 23 +
 offload/include/Shared/APITypes.h |  5 +-
 offload/include/omptarget.h   |  2 +-
 .../common/src/PluginInterface.cpp| 13 ++-
 offload/src/CMakeLists.txt|  1 +
 offload/src/KernelLanguage/API.cpp| 76 +++
 offload/src/exports   |  3 +
 offload/test/lit.cfg  |  2 +-
 offload/test/offloading/CUDA/basic_launch.cu  | 31 ++
 .../CUDA/basic_launch_blocks_and_threads.cu   | 32 ++
 .../offloading/CUDA/basic_launch_multi_arg.cu | 41 
 offload/test/offloading/CUDA/kernel_tu.cu.inc |  1 +
 offload/test/offloading/CUDA/launch_tu.cu | 32 ++
 27 files changed, 576 insertions(+), 53 deletions(-)
 create mode 100644 clang/lib/Headers/llvm_offload_wrappers/__llvm_offload.h
 create mode 100644 
clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_device.h
 create mode 100644 
clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_host.h
 create mode 100644 clang/test/CodeGenCUDA/offload_via_llvm.cu
 create mode 100644 clang/test/Driver/cuda-via-liboffload.cu
 create mode 100644 offload/src/KernelLanguage/API.cpp
 create mode 100644 offload/test/offloading/CUDA/basic_launch.cu
 create mode 100644 
offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu
 create mode 100644 offload/test/offloading/CUDA/basic_launch_multi_arg.cu
 create mode 100644 offload/test/offloading/CUDA/kernel_tu.cu.inc
 create mode 100644 offload/test/offloading/CUDA/launch_tu.cu

diff --git a/clang/include/clang/Basic/LangOptions.def 
b/clang/include/clang/Basic/LangOptions.def
index 2dea3cd4d795b..e8d3be7e89dbb 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -288,6 +288,7 @@ LANGOPT(GPUMaxThreadsPerBlock, 32, 1024, "default max 
threads per block for kern
 LANGOPT(GPUDeferDiag, 1, 0, "defer host/device related diagnostic messages for 
CUDA/HIP")
 LANGOPT(GPUExcludeWrongSideOverloads, 1, 0, "always exclude wrong side 
overloads in overloading resolution for CUDA/HIP")
 LANGOPT(OffloadingNewDriver, 1, 0, "use 

[clang] [llvm] [Offload] Introduce the concept of "default streams" (PR #95371)

2024-06-13 Thread Johannes Doerfert via cfe-commits

https://github.com/jdoerfert updated 
https://github.com/llvm/llvm-project/pull/95371

>From d06585044bd6d2dd76d6110bce933e01fd4b333e Mon Sep 17 00:00:00 2001
From: Johannes Doerfert 
Date: Mon, 3 Jun 2024 19:52:12 -0700
Subject: [PATCH 1/3] [Offload][CUDA] Allow CUDA kernels to use LLVM/Offload
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Through the new `-foffload-via-llvm` flag, CUDA kernels can now be
lowered to the LLVM/Offload API. On the Clang side, this is simply done
by using the OpenMP offload toolchain and emitting calls to `llvm*`
functions to orchestrate the kernel launch rather than `cuda*`
functions. These `llvm*` functions are implemented on top of the
existing LLVM/Offload API.

As we are about to redefine the Offload API, this will help us in the
design process as a second offload language.

We do not support any CUDA APIs yet, however, we could:
  https://www.osti.gov/servlets/purl/1892137

For proper host execution we need to resurrect/rebase
  https://tianshilei.me/wp-content/uploads/2021/12/llpp-2021.pdf
(which was designed for debugging).

```
❯❯❯ cat test.cu
extern "C" {
void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum);
}

__global__ void square(int *A) { *A = 42; }

int main(int argc, char **argv) {
  int DevNo = 0;
  int *Ptr = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(4, DevNo));
  *Ptr = 7;
  printf("Ptr %p, *Ptr %i\n", Ptr, *Ptr);
  square<<<1, 1>>>(Ptr);
  printf("Ptr %p, *Ptr %i\n", Ptr, *Ptr);
  llvm_omp_target_free_shared(Ptr, DevNo);
}

❯❯❯ clang++ test.cu -O3 -o test123 -foffload-via-llvm --offload-arch=native

❯❯❯ llvm-objdump --offloading test123

test123:	file format elf64-x86-64

OFFLOADING IMAGE [0]:
kind            elf
arch            gfx90a
triple          amdgcn-amd-amdhsa
producer        openmp

❯❯❯ LIBOMPTARGET_INFO=16 ./test123
Ptr 0x155448ac8000, *Ptr 7
Ptr 0x155448ac8000, *Ptr 42
```
---
 clang/include/clang/Basic/LangOptions.def |  1 +
 clang/include/clang/Driver/Options.td |  6 ++
 clang/lib/CodeGen/CGCUDANV.cpp| 97 ---
 clang/lib/Driver/Driver.cpp   | 19 ++--
 clang/lib/Driver/ToolChains/Clang.cpp | 27 +-
 clang/lib/Driver/ToolChains/CommonArgs.cpp|  7 +-
 clang/lib/Driver/ToolChains/Cuda.cpp  | 27 +++---
 clang/lib/Headers/CMakeLists.txt  | 18 +++-
 .../llvm_offload_wrappers/__llvm_offload.h| 31 ++
 .../__llvm_offload_device.h   | 10 ++
 .../__llvm_offload_host.h | 15 +++
 .../__clang_openmp_device_functions.h |  9 +-
 clang/lib/Sema/SemaCUDA.cpp   |  3 +
 clang/test/CodeGenCUDA/offload_via_llvm.cu| 97 +++
 clang/test/Driver/cuda-via-liboffload.cu  | 23 +
 offload/include/Shared/APITypes.h |  5 +-
 offload/include/omptarget.h   |  2 +-
 .../common/src/PluginInterface.cpp| 13 ++-
 offload/src/CMakeLists.txt|  1 +
 offload/src/KernelLanguage/API.cpp| 76 +++
 offload/src/exports   |  3 +
 offload/test/lit.cfg  |  2 +-
 offload/test/offloading/CUDA/basic_launch.cu  | 31 ++
 .../CUDA/basic_launch_blocks_and_threads.cu   | 32 ++
 .../offloading/CUDA/basic_launch_multi_arg.cu | 41 
 offload/test/offloading/CUDA/kernel_tu.cu.inc |  1 +
 offload/test/offloading/CUDA/launch_tu.cu | 32 ++
 27 files changed, 576 insertions(+), 53 deletions(-)
 create mode 100644 clang/lib/Headers/llvm_offload_wrappers/__llvm_offload.h
 create mode 100644 
clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_device.h
 create mode 100644 
clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_host.h
 create mode 100644 clang/test/CodeGenCUDA/offload_via_llvm.cu
 create mode 100644 clang/test/Driver/cuda-via-liboffload.cu
 create mode 100644 offload/src/KernelLanguage/API.cpp
 create mode 100644 offload/test/offloading/CUDA/basic_launch.cu
 create mode 100644 
offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu
 create mode 100644 offload/test/offloading/CUDA/basic_launch_multi_arg.cu
 create mode 100644 offload/test/offloading/CUDA/kernel_tu.cu.inc
 create mode 100644 offload/test/offloading/CUDA/launch_tu.cu

diff --git a/clang/include/clang/Basic/LangOptions.def 
b/clang/include/clang/Basic/LangOptions.def
index 2dea3cd4d795b..e8d3be7e89dbb 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -288,6 +288,7 @@ LANGOPT(GPUMaxThreadsPerBlock, 32, 1024, "default max 
threads per block for kern
 LANGOPT(GPUDeferDiag, 1, 0, "defer host/device related diagnostic messages for 
CUDA/HIP")
 LANGOPT(GPUExcludeWrongSideOverloads, 1, 0, "always exclude wrong side 
overloads in overloading resolution for CUDA/HIP")
 LANGOPT(OffloadingNewDriver, 1, 0, "use 

[clang] [llvm] [Offload] Introduce the concept of "default streams" (PR #95371)

2024-06-13 Thread via cfe-commits

github-actions[bot] wrote:




:warning: C/C++ code formatter, clang-format found issues in your code. 
:warning:



You can test this locally with the following command:


```bash
git-clang-format --diff 7adb7aa494247f2492f6207289ad90cb48807517 
705de43498aec79565d6469a00a54e65e988faf8 -- 
clang/lib/Headers/llvm_offload_wrappers/__llvm_offload.h 
clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_device.h 
clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_host.h 
clang/lib/Headers/llvm_offload_wrappers/cuda_runtime.h 
offload/src/KernelLanguage/API.cpp 
offload/test/offloading/CUDA/kernel_tu.cu.inc clang/lib/CodeGen/CGCUDANV.cpp 
clang/lib/Driver/Driver.cpp clang/lib/Driver/ToolChains/Clang.cpp 
clang/lib/Driver/ToolChains/CommonArgs.cpp clang/lib/Driver/ToolChains/Cuda.cpp 
clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h 
clang/lib/Sema/SemaCUDA.cpp llvm/include/llvm/Frontend/OpenMP/OMPConstants.h 
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp offload/include/Shared/APITypes.h 
offload/include/omptarget.h offload/plugins-nextgen/amdgpu/src/rtl.cpp 
offload/plugins-nextgen/common/src/PluginInterface.cpp 
offload/plugins-nextgen/cuda/src/rtl.cpp offload/src/interface.cpp 
offload/src/omptarget.cpp
```





View the diff from clang-format here.


``diff
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp 
b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index ff0f6edfcd..009932ead4 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -553,7 +553,8 @@ Error GenericKernelTy::launch(GenericDeviceTy 
, void **ArgPtrs,
 
   // Kernel languages don't use indirection.
   if (KernelArgs.Flags.IsCUDA) {
-LaunchParams = *reinterpret_cast(KernelArgs.ArgPtrs);
+LaunchParams =
+*reinterpret_cast(KernelArgs.ArgPtrs);
   } else {
 LaunchParams =
 prepareArgs(GenericDevice, ArgPtrs, ArgOffsets, KernelArgs.NumArgs,
diff --git a/offload/src/KernelLanguage/API.cpp 
b/offload/src/KernelLanguage/API.cpp
index 779751deed..b8b1dbb3d6 100644
--- a/offload/src/KernelLanguage/API.cpp
+++ b/offload/src/KernelLanguage/API.cpp
@@ -47,7 +47,7 @@ unsigned __llvmPushCallConfiguration(dim3 __grid_size, dim3 
__block_size,
 
 unsigned __llvmPopCallConfiguration(dim3 *__grid_size, dim3 *__block_size,
 size_t *__shared_memory, void *__stream) {
-   __omp_kernel_t &__kernel = __current_kernel;
+  __omp_kernel_t &__kernel = __current_kernel;
   *__grid_size = __kernel.__grid_size;
   *__block_size = __kernel.__block_size;
   *__shared_memory = __kernel.__shared_memory;

``




https://github.com/llvm/llvm-project/pull/95371
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [Offload] Introduce the concept of "default streams" (PR #95371)

2024-06-13 Thread Johannes Doerfert via cfe-commits

https://github.com/jdoerfert created 
https://github.com/llvm/llvm-project/pull/95371

The offload APIs, and the CUDA wrappers in clang, now support "default
streams" per thread (and per device). It should be per context but we
don't really expose that concept yet. The KernelArguments allow an
LLVM/Offload user to provide an "AsyncInfoQueue", which is plugin
dependent and can later also be created outside or queried from the
runtime. User-managed "queues" are kept persistent, thus not returned to
the pool once synchronized.

The CUDA tests will synchronize via `cudaDeviceSynchronize` before
checking the results.

Based on https://github.com/llvm/llvm-project/pull/94821.

>From 36618e65d94ffa3e83464b7d19ff6cd7d5855abf Mon Sep 17 00:00:00 2001
From: Johannes Doerfert 
Date: Wed, 5 Jun 2024 16:51:51 -0700
Subject: [PATCH 1/5] [Offload][NFCI] Initialize the KernelArgsTy to default
 values

---
 offload/include/Shared/APITypes.h | 30 +-
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/offload/include/Shared/APITypes.h 
b/offload/include/Shared/APITypes.h
index e8fc27785b6c2..fd315c6b992b9 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -89,22 +89,26 @@ struct __tgt_async_info {
 
 /// This struct contains all of the arguments to a target kernel region launch.
 struct KernelArgsTy {
-  uint32_t Version;   // Version of this struct for ABI compatibility.
-  uint32_t NumArgs;   // Number of arguments in each input pointer.
-  void **ArgBasePtrs; // Base pointer of each argument (e.g. a struct).
-  void **ArgPtrs; // Pointer to the argument data.
-  int64_t *ArgSizes;  // Size of the argument data in bytes.
-  int64_t *ArgTypes;  // Type of the data (e.g. to / from).
-  void **ArgNames;// Name of the data for debugging, possibly null.
-  void **ArgMappers;  // User-defined mappers, possibly null.
-  uint64_t Tripcount; // Tripcount for the teams / distribute loop, 0 
otherwise.
+  uint32_t Version = 0; // Version of this struct for ABI compatibility.
+  uint32_t NumArgs = 0; // Number of arguments in each input pointer.
+  void **ArgBasePtrs =
+  nullptr; // Base pointer of each argument (e.g. a 
struct).
+  void **ArgPtrs = nullptr;// Pointer to the argument data.
+  int64_t *ArgSizes = nullptr; // Size of the argument data in bytes.
+  int64_t *ArgTypes = nullptr; // Type of the data (e.g. to / from).
+  void **ArgNames = nullptr;   // Name of the data for debugging, possibly 
null.
+  void **ArgMappers = nullptr; // User-defined mappers, possibly null.
+  uint64_t Tripcount =
+  0; // Tripcount for the teams / distribute loop, 0 otherwise.
   struct {
 uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause.
 uint64_t Unused : 63;
-  } Flags;
-  uint32_t NumTeams[3];// The number of teams (for x,y,z dimension).
-  uint32_t ThreadLimit[3]; // The number of threads (for x,y,z dimension).
-  uint32_t DynCGroupMem;   // Amount of dynamic cgroup memory requested.
+  } Flags = {0, 0};
+  uint32_t NumTeams[3] = {0, 0,
+  0}; // The number of teams (for x,y,z dimension).
+  uint32_t ThreadLimit[3] = {0, 0,
+ 0}; // The number of threads (for x,y,z 
dimension).
+  uint32_t DynCGroupMem = 0; // Amount of dynamic cgroup memory requested.
 };
 static_assert(sizeof(KernelArgsTy().Flags) == sizeof(uint64_t),
   "Invalid struct size");

>From 3d5c61a78e91ecb379a2bfac71988eaf8e5cd9cd Mon Sep 17 00:00:00 2001
From: Johannes Doerfert 
Date: Mon, 3 Jun 2024 19:52:12 -0700
Subject: [PATCH 2/5] [Offload][CUDA] Allow CUDA kernels to use LLVM/Offload
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Through the new `-foffload-via-llvm` flag, CUDA kernels can now be
lowered to the LLVM/Offload API. On the Clang side, this is simply done
by using the OpenMP offload toolchain and emitting calls to `llvm*`
functions to orchestrate the kernel launch rather than `cuda*`
functions. These `llvm*` functions are implemented on top of the
existing LLVM/Offload API.

As we are about to redefine the Offload API, this will help us in the
design process as a second offload language.

We do not support any CUDA APIs yet, however, we could:
  https://www.osti.gov/servlets/purl/1892137

For proper host execution we need to resurrect/rebase
  https://tianshilei.me/wp-content/uploads/2021/12/llpp-2021.pdf
(which was designed for debugging).

```
❯❯❯ cat test.cu
extern "C" {
void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum);
}

__global__ void square(int *A) { *A = 42; }

int main(int argc, char **argv) {
  int DevNo = 0;
  int *Ptr = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(4, DevNo));
  *Ptr = 7;
  printf("Ptr %p, *Ptr %i\n", Ptr, *Ptr);
  square<<<1, 1>>>(Ptr);
  printf("Ptr %p, *Ptr %i\n", Ptr, *Ptr);
  

[clang] [llvm] [Offload] Introduce the concept of "default streams" (PR #95371)

2024-06-13 Thread Johannes Doerfert via cfe-commits


@@ -1125,6 +1125,22 @@ void Clang::AddPreprocessingOptions(Compilation , 
const JobAction ,
 CmdArgs.push_back("__clang_openmp_device_functions.h");
   }
 
+  if (Args.hasArg(options::OPT_foffload_via_llvm)) {
+// Add llvm_wrappers/* to our system include path.  This lets us wrap
+// standard library headers and other headers.
+SmallString<128> P(D.ResourceDir);
+llvm::sys::path::append(P, "include");
+llvm::sys::path::append(P, "llvm_offload_wrappers");

jdoerfert wrote:

Sorry, I pushed this first on top of old commits. This was fixed and I updated 
all my PRs.

Since I'm not able to do stacked commits, basically only review the latest one 
in each PR.

https://github.com/llvm/llvm-project/pull/95371
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [Offload] Introduce the concept of "default streams" (PR #95371)

2024-06-13 Thread Matt Arsenault via cfe-commits


@@ -1125,6 +1125,22 @@ void Clang::AddPreprocessingOptions(Compilation , 
const JobAction ,
 CmdArgs.push_back("__clang_openmp_device_functions.h");
   }
 
+  if (Args.hasArg(options::OPT_foffload_via_llvm)) {
+// Add llvm_wrappers/* to our system include path.  This lets us wrap
+// standard library headers and other headers.
+SmallString<128> P(D.ResourceDir);
+llvm::sys::path::append(P, "include");
+llvm::sys::path::append(P, "llvm_offload_wrappers");
+CmdArgs.push_back("-internal-isystem");
+CmdArgs.push_back(Args.MakeArgString(P));
+
+CmdArgs.push_back("-include");
+if (JA.isDeviceOffloading(Action::OFK_OpenMP))
+  CmdArgs.push_back("__llvm_offload_device.h");
+else
+  CmdArgs.push_back("__llvm_offload_host.h");

arsenm wrote:

Push back a select of the string name? 

https://github.com/llvm/llvm-project/pull/95371
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [Offload] Introduce the concept of "default streams" (PR #95371)

2024-06-13 Thread Matt Arsenault via cfe-commits


@@ -1125,6 +1125,22 @@ void Clang::AddPreprocessingOptions(Compilation , 
const JobAction ,
 CmdArgs.push_back("__clang_openmp_device_functions.h");
   }
 
+  if (Args.hasArg(options::OPT_foffload_via_llvm)) {
+// Add llvm_wrappers/* to our system include path.  This lets us wrap
+// standard library headers and other headers.
+SmallString<128> P(D.ResourceDir);
+llvm::sys::path::append(P, "include");
+llvm::sys::path::append(P, "llvm_offload_wrappers");

arsenm wrote:

I think append is variadic and you can do both pieces in one call 

https://github.com/llvm/llvm-project/pull/95371
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [Offload] Introduce the concept of "default streams" (PR #95371)

2024-06-13 Thread via cfe-commits

llvmbot wrote:



@llvm/pr-subscribers-offload

@llvm/pr-subscribers-clang-driver

Author: Johannes Doerfert (jdoerfert)


Changes

The offload APIs, and the CUDA wrappers in clang, now support "default
streams" per thread (and per device). It should be per context but we
don't really expose that concept yet. The KernelArguments allow an
LLVM/Offload user to provide an "AsyncInfoQueue", which is plugin
dependent and can later also be created outside or queried from the
runtime. User-managed "queues" are kept persistent, thus not returned to
the pool once synchronized.

The CUDA tests will synchronize via `cudaDeviceSynchronize` before
checking the results.

Based on https://github.com/llvm/llvm-project/pull/94821.

---

Patch is 58.22 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/95371.diff


34 Files Affected:

- (modified) clang/include/clang/Basic/LangOptions.def (+1) 
- (modified) clang/include/clang/Driver/Options.td (+6) 
- (modified) clang/lib/CodeGen/CGCUDANV.cpp (+62-15) 
- (modified) clang/lib/Driver/Driver.cpp (+12-7) 
- (modified) clang/lib/Driver/ToolChains/Clang.cpp (+26-4) 
- (modified) clang/lib/Driver/ToolChains/CommonArgs.cpp (+6-1) 
- (modified) clang/lib/Headers/CMakeLists.txt (+16-3) 
- (added) clang/lib/Headers/llvm_offload_wrappers/__llvm_offload.h (+31) 
- (added) clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_device.h (+10) 
- (added) clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_host.h (+15) 
- (added) clang/lib/Headers/llvm_offload_wrappers/cuda_runtime.h (+139) 
- (modified) 
clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h (+2-4) 
- (modified) clang/lib/Sema/SemaCUDA.cpp (+3) 
- (added) clang/test/Driver/cuda-via-liboffload.cu (+23) 
- (modified) llvm/include/llvm/Frontend/OpenMP/OMPConstants.h (+1-1) 
- (modified) llvm/include/llvm/Frontend/OpenMP/OMPKinds.def (+1-1) 
- (modified) llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp (+3-1) 
- (modified) offload/include/Shared/APITypes.h (+27-15) 
- (modified) offload/include/omptarget.h (+17-3) 
- (modified) offload/plugins-nextgen/amdgpu/src/rtl.cpp (+23-10) 
- (modified) offload/plugins-nextgen/common/src/PluginInterface.cpp (+13-3) 
- (modified) offload/plugins-nextgen/cuda/src/rtl.cpp (+11-5) 
- (modified) offload/src/CMakeLists.txt (+1) 
- (added) offload/src/KernelLanguage/API.cpp (+86) 
- (modified) offload/src/exports (+5) 
- (modified) offload/src/interface.cpp (+48-1) 
- (modified) offload/src/omptarget.cpp (+1-1) 
- (modified) offload/test/lit.cfg (+1-1) 
- (added) offload/test/offloading/CUDA/basic_api_malloc_free.cu (+42) 
- (added) offload/test/offloading/CUDA/basic_api_memcpy.cu (+47) 
- (added) offload/test/offloading/CUDA/basic_api_memset.cu (+44) 
- (added) offload/test/offloading/CUDA/basic_launch.cu (+30) 
- (added) offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu (+32) 
- (added) offload/test/offloading/CUDA/basic_launch_multi_arg.cu (+39) 


``diff
diff --git a/clang/include/clang/Basic/LangOptions.def 
b/clang/include/clang/Basic/LangOptions.def
index 4061451b2150a..8aff98867202e 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -288,6 +288,7 @@ LANGOPT(GPUMaxThreadsPerBlock, 32, 1024, "default max 
threads per block for kern
 LANGOPT(GPUDeferDiag, 1, 0, "defer host/device related diagnostic messages for 
CUDA/HIP")
 LANGOPT(GPUExcludeWrongSideOverloads, 1, 0, "always exclude wrong side 
overloads in overloading resolution for CUDA/HIP")
 LANGOPT(OffloadingNewDriver, 1, 0, "use the new driver for generating 
offloading code.")
+LANGOPT(OffloadViaLLVM, 1, 0, "target LLVM/Offload as portable offloading 
runtime.")
 
 LANGOPT(SYCLIsDevice  , 1, 0, "Generate code for SYCL device")
 LANGOPT(SYCLIsHost, 1, 0, "SYCL host compilation")
diff --git a/clang/include/clang/Driver/Options.td 
b/clang/include/clang/Driver/Options.td
index 57f37c5023110..a09d75917ff98 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1275,6 +1275,12 @@ def no_offload_compress : Flag<["--"], 
"no-offload-compress">;
 def offload_compression_level_EQ : Joined<["--"], 
"offload-compression-level=">,
   Flags<[HelpHidden]>,
   HelpText<"Compression level for offload device binaries (HIP only)">;
+
+defm offload_via_llvm : BoolFOption<"offload-via-llvm",
+  LangOpts<"OffloadViaLLVM">, DefaultFalse,
+  PosFlag,
+  NegFlag,
+  BothFlags<[], [ClangOption], " LLVM/Offload as portable offloading 
runtime.">>;
 }
 
 // CUDA options
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index 43dfbbb90dd52..8e32aad88a26d 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -15,10 +15,12 @@
 #include "CGCXXABI.h"
 #include "CodeGenFunction.h"
 #include "CodeGenModule.h"
+#include "clang/AST/CharUnits.h"
 #include "clang/AST/Decl.h"
 #include "clang/Basic/Cuda.h"
 #include