[clang] [HIP] Replace use of `llvm-mc` with `clang` (PR #112041)

2024-10-11 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 updated 
https://github.com/llvm/llvm-project/pull/112041

>From 9de8a92c3bcda9d1fa414b9b355cb8ac77ae0812 Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Fri, 11 Oct 2024 14:53:17 -0500
Subject: [PATCH] [HIP] Replace use of `llvm-mc` with `clang`

Summary:
We currently use `llvm-mc` which is intended for internal testing and
not expected to be present in every installation. This patch changes
that to just use clang instead to get the `.o` from the HIP registration
code.

My preferred solution would be to use the new driver, but I still
haven't gotten the test suite to pass on this one weird OpenMP case.

Fixes: https://github.com/llvm/llvm-project/issues/112031
---
 clang/lib/Driver/ToolChains/HIPUtility.cpp| 15 ---
 clang/test/Driver/hip-link-save-temps.hip |  6 +++---
 clang/test/Driver/hip-partial-link.hip|  4 ++--
 clang/test/Driver/hip-save-temps.hip  |  2 +-
 clang/test/Driver/hip-toolchain-rdc-separate.hip  |  4 ++--
 .../test/Driver/hip-toolchain-rdc-static-lib.hip  |  2 +-
 clang/test/Driver/hip-toolchain-rdc.hip   |  2 +-
 clang/test/Driver/hip-unbundle-preproc.hipi   |  2 +-
 clang/test/Driver/hipspv-toolchain-rdc.hip|  3 +--
 9 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/HIPUtility.cpp 
b/clang/lib/Driver/ToolChains/HIPUtility.cpp
index b3adfe65402ff3..0dc9f7f8a54756 100644
--- a/clang/lib/Driver/ToolChains/HIPUtility.cpp
+++ b/clang/lib/Driver/ToolChains/HIPUtility.cpp
@@ -346,14 +346,14 @@ void HIP::constructGenerateObjFileFromHIPFatBinary(
   // Create Temp Object File Generator,
   // Offload Bundled file and Bundled Object file.
   // Keep them if save-temps is enabled.
-  const char *McinFile;
+  const char *ObjinFile;
   const char *BundleFile;
   if (C.getDriver().isSaveTempsEnabled()) {
-McinFile = C.getArgs().MakeArgString(Name + ".mcin");
+ObjinFile = C.getArgs().MakeArgString(Name + ".mcin");
 BundleFile = C.getArgs().MakeArgString(Name + ".hipfb");
   } else {
 auto TmpNameMcin = C.getDriver().GetTemporaryPath(Name, "mcin");
-McinFile = C.addTempFile(C.getArgs().MakeArgString(TmpNameMcin));
+ObjinFile = C.addTempFile(C.getArgs().MakeArgString(TmpNameMcin));
 auto TmpNameFb = C.getDriver().GetTemporaryPath(Name, "hipfb");
 BundleFile = C.addTempFile(C.getArgs().MakeArgString(TmpNameFb));
   }
@@ -454,7 +454,7 @@ void HIP::constructGenerateObjFileFromHIPFatBinary(
 
   // Open script file and write the contents.
   std::error_code EC;
-  llvm::raw_fd_ostream Objf(McinFile, EC, llvm::sys::fs::OF_None);
+  llvm::raw_fd_ostream Objf(ObjinFile, EC, llvm::sys::fs::OF_None);
 
   if (EC) {
 C.getDriver().Diag(clang::diag::err_unable_to_make_temp) << EC.message();
@@ -463,10 +463,11 @@ void HIP::constructGenerateObjFileFromHIPFatBinary(
 
   Objf << ObjBuffer;
 
-  ArgStringList McArgs{"-triple", Args.MakeArgString(HostTriple.normalize()),
+  ArgStringList McArgs{"-target", Args.MakeArgString(HostTriple.normalize()),
"-o",  Output.getFilename(),
-   McinFile,  "--filetype=obj"};
-  const char *Mc = Args.MakeArgString(TC.GetProgramPath("llvm-mc"));
+   "-x",  "assembler",
+   ObjinFile, "-c"};
+  const char *Mc = Args.MakeArgString(TC.GetProgramPath("clang"));
   C.addCommand(std::make_unique(JA, T, ResponseFileSupport::None(), 
Mc,
  McArgs, Inputs, Output));
 }
diff --git a/clang/test/Driver/hip-link-save-temps.hip 
b/clang/test/Driver/hip-link-save-temps.hip
index 5656614626b9cd..e321970274bb4b 100644
--- a/clang/test/Driver/hip-link-save-temps.hip
+++ b/clang/test/Driver/hip-link-save-temps.hip
@@ -39,10 +39,10 @@
 // CHECK-NOT: {{".*/opt"}}
 // CHECK-NOT: {{".*/llc"}}
 // CHECK: "{{.*lld.*}}" {{.*}} "-plugin-opt=-amdgpu-internalize-symbols"
-// CHECK-SAME: "-o" "a.out-hip-amdgcn-amd-amdhsa-gfx900" 
"obj1-hip-amdgcn-amd-amdhsa-gfx900.o" "obj2-hip-amdgcn-amd-amdhsa-gfx900.o"
+// CHECK-SAME: "-o" "[[HIPFB1:.+]]" "obj1-hip-amdgcn-amd-amdhsa-gfx900.o" 
"obj2-hip-amdgcn-amd-amdhsa-gfx900.o"
 // CHECK: "{{.*lld.*}}" {{.*}} "-plugin-opt=-amdgpu-internalize-symbols"
-// CHECK-SAME: "-o" "a.out-hip-amdgcn-amd-amdhsa-gfx906" 
"obj1-hip-amdgcn-amd-amdhsa-gfx906.o" "obj2-hip-amdgcn-amd-amdhsa-gfx906.o"
-// CHECK: {{".*llvm-mc.*"}} "-o" "[[OBJBUNDLE:.*.o]]" "{{.*}}.mcin" 
"--filetype=obj"
+// CHECK-SAME: "-o" "[[HIPFB2:.+]]" "obj1-hip-amdgcn-amd-amdhsa-gfx906.o" 
"obj2-hip-amdgcn-amd-amdhsa-gfx906.o"
+// CHECK: "{{.*clang.*}}" "-target" "x86_64-unknown-linux-gnu" "-o" 
"[[OBJBUNDLE:.+.o]]" "-x" "assembler" "{{.*}}.mcin" "-c"
 // OUT: "{{.*ld.*}}" {{.*}} "-o" "executable" {{.*}} "[[OBJBUNDLE]]"
 // NOUT: "{{.*ld.*}}" {{.*}} "-o" "a.out" {{.*}} "[[OBJBUNDLE]]"
 // SLO: "{{.*llvm-ar.*}}" "rcsD" "libTest.a" {{.*}} "[[OBJBUNDLE]]"
diff --git a/clang/test/Driver/hip-partial-link.hip 
b/clang/te

[clang] [HIP] Replace use of `llvm-mc` with `clang` (PR #112041)

2024-10-11 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 created 
https://github.com/llvm/llvm-project/pull/112041

Summary:
We currently use `llvm-mc` which is intended for internal testing and
not expected to be present in every installation. This patch changes
that to just use clang instead to get the `.o` from the HIP registration
code.

My preferred solution would be to use the new driver, but I still
haven't gotten the test suite to pass on this one weird OpenMP case.

Fixes: https://github.com/llvm/llvm-project/issues/112031


>From 84a8f05136c71a4fbc00062ffe1d33575336cbdc Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Fri, 11 Oct 2024 14:53:17 -0500
Subject: [PATCH] [HIP] Replace use of `llvm-mc` with `clang`

Summary:
We currently use `llvm-mc` which is intended for internal testing and
not expected to be present in every installation. This patch changes
that to just use clang instead to get the `.o` from the HIP registration
code.

My preferred solution would be to use the new driver, but I still
haven't gotten the test suite to pass on this one weird OpenMP case.

Fixes: https://github.com/llvm/llvm-project/issues/112031
---
 clang/lib/Driver/ToolChains/HIPUtility.cpp   | 16 
 clang/test/Driver/hip-link-save-temps.hip|  6 +++---
 clang/test/Driver/hip-partial-link.hip   |  4 ++--
 clang/test/Driver/hip-save-temps.hip |  2 +-
 clang/test/Driver/hip-toolchain-rdc-separate.hip |  4 ++--
 .../test/Driver/hip-toolchain-rdc-static-lib.hip |  2 +-
 clang/test/Driver/hip-toolchain-rdc.hip  |  2 +-
 clang/test/Driver/hip-unbundle-preproc.hipi  |  2 +-
 clang/test/Driver/hipspv-toolchain-rdc.hip   |  3 +--
 9 files changed, 20 insertions(+), 21 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/HIPUtility.cpp 
b/clang/lib/Driver/ToolChains/HIPUtility.cpp
index b3adfe65402ff3..ffeea269a068ff 100644
--- a/clang/lib/Driver/ToolChains/HIPUtility.cpp
+++ b/clang/lib/Driver/ToolChains/HIPUtility.cpp
@@ -346,14 +346,14 @@ void HIP::constructGenerateObjFileFromHIPFatBinary(
   // Create Temp Object File Generator,
   // Offload Bundled file and Bundled Object file.
   // Keep them if save-temps is enabled.
-  const char *McinFile;
+  const char *ObjinFile;
   const char *BundleFile;
   if (C.getDriver().isSaveTempsEnabled()) {
-McinFile = C.getArgs().MakeArgString(Name + ".mcin");
+ObjinFile = C.getArgs().MakeArgString(Name + ".mcin");
 BundleFile = C.getArgs().MakeArgString(Name + ".hipfb");
   } else {
 auto TmpNameMcin = C.getDriver().GetTemporaryPath(Name, "mcin");
-McinFile = C.addTempFile(C.getArgs().MakeArgString(TmpNameMcin));
+ObjinFile = C.addTempFile(C.getArgs().MakeArgString(TmpNameMcin));
 auto TmpNameFb = C.getDriver().GetTemporaryPath(Name, "hipfb");
 BundleFile = C.addTempFile(C.getArgs().MakeArgString(TmpNameFb));
   }
@@ -454,7 +454,7 @@ void HIP::constructGenerateObjFileFromHIPFatBinary(
 
   // Open script file and write the contents.
   std::error_code EC;
-  llvm::raw_fd_ostream Objf(McinFile, EC, llvm::sys::fs::OF_None);
+  llvm::raw_fd_ostream Objf(ObjinFile, EC, llvm::sys::fs::OF_None);
 
   if (EC) {
 C.getDriver().Diag(clang::diag::err_unable_to_make_temp) << EC.message();
@@ -463,10 +463,10 @@ void HIP::constructGenerateObjFileFromHIPFatBinary(
 
   Objf << ObjBuffer;
 
-  ArgStringList McArgs{"-triple", Args.MakeArgString(HostTriple.normalize()),
-   "-o",  Output.getFilename(),
-   McinFile,  "--filetype=obj"};
-  const char *Mc = Args.MakeArgString(TC.GetProgramPath("llvm-mc"));
+  ArgStringList McArgs{"-target", Args.MakeArgString(HostTriple.normalize()),
+   "-o",  Output.getFilename(), "-x", "assembler",
+   ObjinFile,  "-c"};
+  const char *Mc = Args.MakeArgString(TC.GetProgramPath("clang"));
   C.addCommand(std::make_unique(JA, T, ResponseFileSupport::None(), 
Mc,
  McArgs, Inputs, Output));
 }
diff --git a/clang/test/Driver/hip-link-save-temps.hip 
b/clang/test/Driver/hip-link-save-temps.hip
index 5656614626b9cd..e321970274bb4b 100644
--- a/clang/test/Driver/hip-link-save-temps.hip
+++ b/clang/test/Driver/hip-link-save-temps.hip
@@ -39,10 +39,10 @@
 // CHECK-NOT: {{".*/opt"}}
 // CHECK-NOT: {{".*/llc"}}
 // CHECK: "{{.*lld.*}}" {{.*}} "-plugin-opt=-amdgpu-internalize-symbols"
-// CHECK-SAME: "-o" "a.out-hip-amdgcn-amd-amdhsa-gfx900" 
"obj1-hip-amdgcn-amd-amdhsa-gfx900.o" "obj2-hip-amdgcn-amd-amdhsa-gfx900.o"
+// CHECK-SAME: "-o" "[[HIPFB1:.+]]" "obj1-hip-amdgcn-amd-amdhsa-gfx900.o" 
"obj2-hip-amdgcn-amd-amdhsa-gfx900.o"
 // CHECK: "{{.*lld.*}}" {{.*}} "-plugin-opt=-amdgpu-internalize-symbols"
-// CHECK-SAME: "-o" "a.out-hip-amdgcn-amd-amdhsa-gfx906" 
"obj1-hip-amdgcn-amd-amdhsa-gfx906.o" "obj2-hip-amdgcn-amd-amdhsa-gfx906.o"
-// CHECK: {{".*llvm-mc.*"}} "-o" "[[OBJBUNDLE:.*.o]]" "{{.*}}.mcin" 
"--filetype=obj"
+// CHECK-SAME: "-o" "[[HIPFB2:.+]]" "obj1-hip-amdgcn-amd-amdhsa

[clang] clang: Add llvm-mc to CLANG_TEST_DEPS (PR #112032)

2024-10-11 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 approved this pull request.


https://github.com/llvm/llvm-project/pull/112032
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang] Add a flag to include GPU startup files (PR #112025)

2024-10-11 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 edited 
https://github.com/llvm/llvm-project/pull/112025
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang] Add a flag to include GPU startup files (PR #112025)

2024-10-11 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 updated 
https://github.com/llvm/llvm-project/pull/112025

>From a014a06d5b06bb8c94b4d86636fb57f342e184db Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Fri, 11 Oct 2024 12:21:49 -0500
Subject: [PATCH 1/3] [Clang] Add a flag to include GPU startup files

Summary:
The C library for GPUs provides the ability to target regular C/C++
programs by providing the C library and a file containing kernels that
call the `main` function. This is mostly used for unit tests; this patch
provides a quick way to add them without needing to know the paths. I
currently do this explicitly, but according to the libc++ contributors
we don't want to need to specify these paths manually. See the
discussion in https://github.com/llvm/llvm-project/pull/104515.

I just default to `lib/` if the target-specific one isn't found because
the linker will handle giving a reasonable error message if it's not
found. Basically the use-case looks like this.

```console
$ clang test.c --target=amdgcn-amd-amdhsa -mcpu=native -gpustartfiles
$ amdhsa-loader a.out
PASS!
```
---
 clang/include/clang/Driver/Options.td  | 3 +++
 clang/lib/Driver/ToolChains/AMDGPU.cpp | 9 +
 clang/lib/Driver/ToolChains/Cuda.cpp   | 9 +
 clang/test/Driver/gpustartfiles.c  | 7 +++
 4 files changed, 28 insertions(+)
 create mode 100644 clang/test/Driver/gpustartfiles.c

diff --git a/clang/include/clang/Driver/Options.td 
b/clang/include/clang/Driver/Options.td
index d306c751505e98..b7f7a7cb47f754 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1316,6 +1316,9 @@ defm offload_via_llvm : BoolFOption<"offload-via-llvm",
   BothFlags<[], [ClangOption], " LLVM/Offload as portable offloading 
runtime.">>;
 }
 
+def gpustartfiles : Flag<["-"], "gpustartfiles">, Group,
+  HelpText<"Link the GPU C startup utilities automatically, used for 
testing.">;
+
 // CUDA options
 let Group = cuda_Group in {
 def cuda_include_ptx_EQ : Joined<["--"], "cuda-include-ptx=">,
diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp 
b/clang/lib/Driver/ToolChains/AMDGPU.cpp
index 2c85d21ebd738c..9a648be4ea3915 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -648,6 +648,15 @@ void amdgpu::Linker::ConstructJob(Compilation &C, const 
JobAction &JA,
 Args.MakeArgString("-plugin-opt=-mattr=" + llvm::join(Features, ",")));
   }
 
+  if (Args.hasArg(options::OPT_gpustartfiles)) {
+auto IncludePath = getToolChain().getStdlibPath();
+if (!IncludePath)
+  IncludePath = "/lib";
+SmallString<128> P(*IncludePath);
+llvm::sys::path::append(P, "crt1.o");
+CmdArgs.append({"-lc", "-lm", Args.MakeArgString(P)});
+  }
+
   CmdArgs.push_back("-o");
   CmdArgs.push_back(Output.getFilename());
   C.addCommand(std::make_unique(
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp 
b/clang/lib/Driver/ToolChains/Cuda.cpp
index 7a70cf1c5694fd..ff96ff989db630 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -641,6 +641,15 @@ void NVPTX::Linker::ConstructJob(Compilation &C, const 
JobAction &JA,
   llvm::sys::path::append(DefaultLibPath, CLANG_INSTALL_LIBDIR_BASENAME);
   CmdArgs.push_back(Args.MakeArgString(Twine("-L") + DefaultLibPath));
 
+  if (Args.hasArg(options::OPT_gpustartfiles)) {
+auto IncludePath = getToolChain().getStdlibPath();
+if (!IncludePath)
+  IncludePath = "/lib";
+SmallString<128> P(*IncludePath);
+llvm::sys::path::append(P, "crt1.o");
+CmdArgs.append({"-lc", "-lm", Args.MakeArgString(P)});
+  }
+
   C.addCommand(std::make_unique(
   JA, *this,
   ResponseFileSupport{ResponseFileSupport::RF_Full, llvm::sys::WEM_UTF8,
diff --git a/clang/test/Driver/gpustartfiles.c 
b/clang/test/Driver/gpustartfiles.c
new file mode 100644
index 00..c1b7a6fa922df4
--- /dev/null
+++ b/clang/test/Driver/gpustartfiles.c
@@ -0,0 +1,7 @@
+// RUN: %clang -target nvptx64-nvidia-cuda -march=sm_61 -gpustartfiles \
+// RUN:   -nogpulib -nogpuinc -### %s 2>&1 | FileCheck -check-prefix=NVPTX %s
+// NVPTX: clang-nvlink-wrapper{{.*}}"-lc" "-lm" "{{.*}}crt1.o"
+//
+// RUN: %clang -target amdgcn-amd-amdhsa -march=gfx90a -gpustartfiles \
+// RUN:   -nogpulib -nogpuinc -### %s 2>&1 | FileCheck -check-prefix=AMDGPU %s
+// AMDGPU: ld.lld{{.*}}"-lc" "-lm" "{{.*}}crt1.o"

>From 7a7dc3b39e5b3109c0dbcaa59eab67219b5174dd Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Fri, 11 Oct 2024 12:52:49 -0500
Subject: [PATCH 2/3] Add -startfiles -stdlib positive versions for GPU

---
 clang/include/clang/Driver/Options.td  | 7 ---
 clang/lib/Driver/ToolChains/AMDGPU.cpp | 6 --
 clang/lib/Driver/ToolChains/Cuda.cpp   | 6 --
 clang/test/Driver/gpustartfiles.c  | 4 ++--
 4 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/clang/include/clang/Driver/Options.td 
b/clang/include/clang/Driver/Options.td
index b7f7a7cb47f754..63df354f3ce210 100644
--- a/clang/include

[clang] [Clang] Add a flag to include GPU startup files (PR #112025)

2024-10-11 Thread Joseph Huber via cfe-commits


@@ -648,6 +648,15 @@ void amdgpu::Linker::ConstructJob(Compilation &C, const 
JobAction &JA,
 Args.MakeArgString("-plugin-opt=-mattr=" + llvm::join(Features, ",")));
   }
 
+  if (Args.hasArg(options::OPT_gpustartfiles)) {

jhuber6 wrote:

Done.

https://github.com/llvm/llvm-project/pull/112025
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang] Add a flag to include GPU startup files (PR #112025)

2024-10-11 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 updated 
https://github.com/llvm/llvm-project/pull/112025

>From a014a06d5b06bb8c94b4d86636fb57f342e184db Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Fri, 11 Oct 2024 12:21:49 -0500
Subject: [PATCH 1/2] [Clang] Add a flag to include GPU startup files

Summary:
The C library for GPUs provides the ability to target regular C/C++
programs by providing the C library and a file containing kernels that
call the `main` function. This is mostly used for unit tests; this patch
provides a quick way to add them without needing to know the paths. I
currently do this explicitly, but according to the libc++ contributors
we don't want to need to specify these paths manually. See the
discussion in https://github.com/llvm/llvm-project/pull/104515.

I just default to `lib/` if the target-specific one isn't found because
the linker will handle giving a reasonable error message if it's not
found. Basically the use-case looks like this.

```console
$ clang test.c --target=amdgcn-amd-amdhsa -mcpu=native -gpustartfiles
$ amdhsa-loader a.out
PASS!
```
---
 clang/include/clang/Driver/Options.td  | 3 +++
 clang/lib/Driver/ToolChains/AMDGPU.cpp | 9 +
 clang/lib/Driver/ToolChains/Cuda.cpp   | 9 +
 clang/test/Driver/gpustartfiles.c  | 7 +++
 4 files changed, 28 insertions(+)
 create mode 100644 clang/test/Driver/gpustartfiles.c

diff --git a/clang/include/clang/Driver/Options.td 
b/clang/include/clang/Driver/Options.td
index d306c751505e98..b7f7a7cb47f754 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1316,6 +1316,9 @@ defm offload_via_llvm : BoolFOption<"offload-via-llvm",
   BothFlags<[], [ClangOption], " LLVM/Offload as portable offloading 
runtime.">>;
 }
 
+def gpustartfiles : Flag<["-"], "gpustartfiles">, Group,
+  HelpText<"Link the GPU C startup utilities automatically, used for 
testing.">;
+
 // CUDA options
 let Group = cuda_Group in {
 def cuda_include_ptx_EQ : Joined<["--"], "cuda-include-ptx=">,
diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp 
b/clang/lib/Driver/ToolChains/AMDGPU.cpp
index 2c85d21ebd738c..9a648be4ea3915 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -648,6 +648,15 @@ void amdgpu::Linker::ConstructJob(Compilation &C, const 
JobAction &JA,
 Args.MakeArgString("-plugin-opt=-mattr=" + llvm::join(Features, ",")));
   }
 
+  if (Args.hasArg(options::OPT_gpustartfiles)) {
+auto IncludePath = getToolChain().getStdlibPath();
+if (!IncludePath)
+  IncludePath = "/lib";
+SmallString<128> P(*IncludePath);
+llvm::sys::path::append(P, "crt1.o");
+CmdArgs.append({"-lc", "-lm", Args.MakeArgString(P)});
+  }
+
   CmdArgs.push_back("-o");
   CmdArgs.push_back(Output.getFilename());
   C.addCommand(std::make_unique(
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp 
b/clang/lib/Driver/ToolChains/Cuda.cpp
index 7a70cf1c5694fd..ff96ff989db630 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -641,6 +641,15 @@ void NVPTX::Linker::ConstructJob(Compilation &C, const 
JobAction &JA,
   llvm::sys::path::append(DefaultLibPath, CLANG_INSTALL_LIBDIR_BASENAME);
   CmdArgs.push_back(Args.MakeArgString(Twine("-L") + DefaultLibPath));
 
+  if (Args.hasArg(options::OPT_gpustartfiles)) {
+auto IncludePath = getToolChain().getStdlibPath();
+if (!IncludePath)
+  IncludePath = "/lib";
+SmallString<128> P(*IncludePath);
+llvm::sys::path::append(P, "crt1.o");
+CmdArgs.append({"-lc", "-lm", Args.MakeArgString(P)});
+  }
+
   C.addCommand(std::make_unique(
   JA, *this,
   ResponseFileSupport{ResponseFileSupport::RF_Full, llvm::sys::WEM_UTF8,
diff --git a/clang/test/Driver/gpustartfiles.c 
b/clang/test/Driver/gpustartfiles.c
new file mode 100644
index 00..c1b7a6fa922df4
--- /dev/null
+++ b/clang/test/Driver/gpustartfiles.c
@@ -0,0 +1,7 @@
+// RUN: %clang -target nvptx64-nvidia-cuda -march=sm_61 -gpustartfiles \
+// RUN:   -nogpulib -nogpuinc -### %s 2>&1 | FileCheck -check-prefix=NVPTX %s
+// NVPTX: clang-nvlink-wrapper{{.*}}"-lc" "-lm" "{{.*}}crt1.o"
+//
+// RUN: %clang -target amdgcn-amd-amdhsa -march=gfx90a -gpustartfiles \
+// RUN:   -nogpulib -nogpuinc -### %s 2>&1 | FileCheck -check-prefix=AMDGPU %s
+// AMDGPU: ld.lld{{.*}}"-lc" "-lm" "{{.*}}crt1.o"

>From 7a7dc3b39e5b3109c0dbcaa59eab67219b5174dd Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Fri, 11 Oct 2024 12:52:49 -0500
Subject: [PATCH 2/2] Add -startfiles -stdlib positive versions for GPU

---
 clang/include/clang/Driver/Options.td  | 7 ---
 clang/lib/Driver/ToolChains/AMDGPU.cpp | 6 --
 clang/lib/Driver/ToolChains/Cuda.cpp   | 6 --
 clang/test/Driver/gpustartfiles.c  | 4 ++--
 4 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/clang/include/clang/Driver/Options.td 
b/clang/include/clang/Driver/Options.td
index b7f7a7cb47f754..63df354f3ce210 100644
--- a/clang/include

[clang] [Clang] Add a flag to include GPU startup files (PR #112025)

2024-10-11 Thread Joseph Huber via cfe-commits


@@ -648,6 +648,15 @@ void amdgpu::Linker::ConstructJob(Compilation &C, const 
JobAction &JA,
 Args.MakeArgString("-plugin-opt=-mattr=" + llvm::join(Features, ",")));
   }
 
+  if (Args.hasArg(options::OPT_gpustartfiles)) {

jhuber6 wrote:

What I meant, but that's a much bigger change. Maybe I could get away with only 
using the positive variant in the GPU toolchain?

https://github.com/llvm/llvm-project/pull/112025
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang] Add a flag to include GPU startup files (PR #112025)

2024-10-11 Thread Joseph Huber via cfe-commits


@@ -648,6 +648,15 @@ void amdgpu::Linker::ConstructJob(Compilation &C, const 
JobAction &JA,
 Args.MakeArgString("-plugin-opt=-mattr=" + llvm::join(Features, ",")));
   }
 
+  if (Args.hasArg(options::OPT_gpustartfiles)) {

jhuber6 wrote:

Unfortunately not, there's `-nostartfiles` and `-nostdlib`, but nothing to 
invert that. Adding one would probably add a few hundred changes of `hasArg` to 
`hasFlag`. The other targets just assume they're hosted, and I can't just link 
calls to `main` by default since no one wants that unless they're trying to 
write tests or something.

https://github.com/llvm/llvm-project/pull/112025
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang] Add a flag to include GPU startup files (PR #112025)

2024-10-11 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 created 
https://github.com/llvm/llvm-project/pull/112025

Summary:
The C library for GPUs provides the ability to target regular C/C++
programs by providing the C library and a file containing kernels that
call the `main` function. This is mostly used for unit tests; this patch
provides a quick way to add them without needing to know the paths. I
currently do this explicitly, but according to the libc++ contributors
we don't want to need to specify these paths manually. See the
discussion in https://github.com/llvm/llvm-project/pull/104515.

I just default to `lib/` if the target-specific one isn't found because
the linker will handle giving a reasonable error message if it's not
found. Basically the use-case looks like this.

```console
$ clang test.c --target=amdgcn-amd-amdhsa -mcpu=native -gpustartfiles
$ amdhsa-loader a.out
PASS!
```


>From a014a06d5b06bb8c94b4d86636fb57f342e184db Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Fri, 11 Oct 2024 12:21:49 -0500
Subject: [PATCH] [Clang] Add a flag to include GPU startup files

Summary:
The C library for GPUs provides the ability to target regular C/C++
programs by providing the C library and a file containing kernels that
call the `main` function. This is mostly used for unit tests; this patch
provides a quick way to add them without needing to know the paths. I
currently do this explicitly, but according to the libc++ contributors
we don't want to need to specify these paths manually. See the
discussion in https://github.com/llvm/llvm-project/pull/104515.

I just default to `lib/` if the target-specific one isn't found because
the linker will handle giving a reasonable error message if it's not
found. Basically the use-case looks like this.

```console
$ clang test.c --target=amdgcn-amd-amdhsa -mcpu=native -gpustartfiles
$ amdhsa-loader a.out
PASS!
```
---
 clang/include/clang/Driver/Options.td  | 3 +++
 clang/lib/Driver/ToolChains/AMDGPU.cpp | 9 +
 clang/lib/Driver/ToolChains/Cuda.cpp   | 9 +
 clang/test/Driver/gpustartfiles.c  | 7 +++
 4 files changed, 28 insertions(+)
 create mode 100644 clang/test/Driver/gpustartfiles.c

diff --git a/clang/include/clang/Driver/Options.td 
b/clang/include/clang/Driver/Options.td
index d306c751505e98..b7f7a7cb47f754 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1316,6 +1316,9 @@ defm offload_via_llvm : BoolFOption<"offload-via-llvm",
   BothFlags<[], [ClangOption], " LLVM/Offload as portable offloading 
runtime.">>;
 }
 
+def gpustartfiles : Flag<["-"], "gpustartfiles">, Group,
+  HelpText<"Link the GPU C startup utilities automatically, used for 
testing.">;
+
 // CUDA options
 let Group = cuda_Group in {
 def cuda_include_ptx_EQ : Joined<["--"], "cuda-include-ptx=">,
diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp 
b/clang/lib/Driver/ToolChains/AMDGPU.cpp
index 2c85d21ebd738c..9a648be4ea3915 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -648,6 +648,15 @@ void amdgpu::Linker::ConstructJob(Compilation &C, const 
JobAction &JA,
 Args.MakeArgString("-plugin-opt=-mattr=" + llvm::join(Features, ",")));
   }
 
+  if (Args.hasArg(options::OPT_gpustartfiles)) {
+auto IncludePath = getToolChain().getStdlibPath();
+if (!IncludePath)
+  IncludePath = "/lib";
+SmallString<128> P(*IncludePath);
+llvm::sys::path::append(P, "crt1.o");
+CmdArgs.append({"-lc", "-lm", Args.MakeArgString(P)});
+  }
+
   CmdArgs.push_back("-o");
   CmdArgs.push_back(Output.getFilename());
   C.addCommand(std::make_unique(
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp 
b/clang/lib/Driver/ToolChains/Cuda.cpp
index 7a70cf1c5694fd..ff96ff989db630 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -641,6 +641,15 @@ void NVPTX::Linker::ConstructJob(Compilation &C, const 
JobAction &JA,
   llvm::sys::path::append(DefaultLibPath, CLANG_INSTALL_LIBDIR_BASENAME);
   CmdArgs.push_back(Args.MakeArgString(Twine("-L") + DefaultLibPath));
 
+  if (Args.hasArg(options::OPT_gpustartfiles)) {
+auto IncludePath = getToolChain().getStdlibPath();
+if (!IncludePath)
+  IncludePath = "/lib";
+SmallString<128> P(*IncludePath);
+llvm::sys::path::append(P, "crt1.o");
+CmdArgs.append({"-lc", "-lm", Args.MakeArgString(P)});
+  }
+
   C.addCommand(std::make_unique(
   JA, *this,
   ResponseFileSupport{ResponseFileSupport::RF_Full, llvm::sys::WEM_UTF8,
diff --git a/clang/test/Driver/gpustartfiles.c 
b/clang/test/Driver/gpustartfiles.c
new file mode 100644
index 00..c1b7a6fa922df4
--- /dev/null
+++ b/clang/test/Driver/gpustartfiles.c
@@ -0,0 +1,7 @@
+// RUN: %clang -target nvptx64-nvidia-cuda -march=sm_61 -gpustartfiles \
+// RUN:   -nogpulib -nogpuinc -### %s 2>&1 | FileCheck -check-prefix=NVPTX %s
+// NVPTX: clang-nvlink-wrapper{{.*}}"-lc" "-lm" "{{.*}}crt1.o"
+//
+// RUN: %clang -target amdgcn-

[clang] [llvm] [Clang] Put offloading globals in the `.llvm.rodata.offloading` section (PR #111890)

2024-10-10 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 updated 
https://github.com/llvm/llvm-project/pull/111890

>From db58755323a6538c7a65bbdc323c5718dbc89dcb Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Thu, 10 Oct 2024 13:42:22 -0500
Subject: [PATCH] [Clang] Put offloading globals in the
 `.llvm.rodata.offloading` section

Summary:
For our offloading entries, we currently store all the string names of
kernels that the runtime will need to load from the target executable.
These are available via pointer in the `__tgt_offload_entry` struct,
however this makes it difficult to obtain from the object itself. This
patch simply puts the strings in a named section so they can be easily
queried.

The motivation behind this is that when the linker wrapper is doing
linking, it wants to know which kernels the host executable is calling.
We *could* get this already via the `.relaomp_offloading_entries`
section by trawling through the string table, but that's quite annoying
and not portable. The follow-up to this should be to make the linker
wrapper get a list of all used symbols the device link job should count
as "needed" so we can handle static linking more directly.
---
 clang/test/CodeGenCUDA/offloading-entries.cu | 65 ++--
 llvm/lib/Frontend/Offloading/Utility.cpp |  8 +++
 2 files changed, 53 insertions(+), 20 deletions(-)

diff --git a/clang/test/CodeGenCUDA/offloading-entries.cu 
b/clang/test/CodeGenCUDA/offloading-entries.cu
index ec21f018607ff0..259e3324e8ac94 100644
--- a/clang/test/CodeGenCUDA/offloading-entries.cu
+++ b/clang/test/CodeGenCUDA/offloading-entries.cu
@@ -15,48 +15,48 @@
 #include "Inputs/cuda.h"
 
 //.
-// CUDA: @.offloading.entry_name = internal unnamed_addr constant [8 x i8] 
c"_Z3foov\00"
+// CUDA: @.offloading.entry_name = internal unnamed_addr constant [8 x i8] 
c"_Z3foov\00", section ".llvm.rodata.offloading", align 1
 // CUDA: @.offloading.entry._Z3foov = weak constant 
%struct.__tgt_offload_entry { ptr @_Z18__device_stub__foov, ptr 
@.offloading.entry_name, i64 0, i32 0, i32 0 }, section 
"cuda_offloading_entries", align 1
-// CUDA: @.offloading.entry_name.1 = internal unnamed_addr constant [11 x i8] 
c"_Z6kernelv\00"
+// CUDA: @.offloading.entry_name.1 = internal unnamed_addr constant [11 x i8] 
c"_Z6kernelv\00", section ".llvm.rodata.offloading", align 1
 // CUDA: @.offloading.entry._Z6kernelv = weak constant 
%struct.__tgt_offload_entry { ptr @_Z21__device_stub__kernelv, ptr 
@.offloading.entry_name.1, i64 0, i32 0, i32 0 }, section 
"cuda_offloading_entries", align 1
-// CUDA: @.offloading.entry_name.2 = internal unnamed_addr constant [4 x i8] 
c"var\00"
+// CUDA: @.offloading.entry_name.2 = internal unnamed_addr constant [4 x i8] 
c"var\00", section ".llvm.rodata.offloading", align 1
 // CUDA: @.offloading.entry.var = weak constant %struct.__tgt_offload_entry { 
ptr @var, ptr @.offloading.entry_name.2, i64 4, i32 0, i32 0 }, section 
"cuda_offloading_entries", align 1
-// CUDA: @.offloading.entry_name.3 = internal unnamed_addr constant [5 x i8] 
c"surf\00"
+// CUDA: @.offloading.entry_name.3 = internal unnamed_addr constant [5 x i8] 
c"surf\00", section ".llvm.rodata.offloading", align 1
 // CUDA: @.offloading.entry.surf = weak constant %struct.__tgt_offload_entry { 
ptr @surf, ptr @.offloading.entry_name.3, i64 4, i32 2, i32 1 }, section 
"cuda_offloading_entries", align 1
-// CUDA: @.offloading.entry_name.4 = internal unnamed_addr constant [4 x i8] 
c"tex\00"
+// CUDA: @.offloading.entry_name.4 = internal unnamed_addr constant [4 x i8] 
c"tex\00", section ".llvm.rodata.offloading", align 1
 // CUDA: @.offloading.entry.tex = weak constant %struct.__tgt_offload_entry { 
ptr @tex, ptr @.offloading.entry_name.4, i64 4, i32 3, i32 1 }, section 
"cuda_offloading_entries", align 1
 //.
-// HIP: @.offloading.entry_name = internal unnamed_addr constant [8 x i8] 
c"_Z3foov\00"
+// HIP: @.offloading.entry_name = internal unnamed_addr constant [8 x i8] 
c"_Z3foov\00", section ".llvm.rodata.offloading", align 1
 // HIP: @.offloading.entry._Z3foov = weak constant %struct.__tgt_offload_entry 
{ ptr @_Z3foov, ptr @.offloading.entry_name, i64 0, i32 0, i32 0 }, section 
"hip_offloading_entries", align 1
-// HIP: @.offloading.entry_name.1 = internal unnamed_addr constant [11 x i8] 
c"_Z6kernelv\00"
+// HIP: @.offloading.entry_name.1 = internal unnamed_addr constant [11 x i8] 
c"_Z6kernelv\00", section ".llvm.rodata.offloading", align 1
 // HIP: @.offloading.entry._Z6kernelv = weak constant 
%struct.__tgt_offload_entry { ptr @_Z6kernelv, ptr @.offloading.entry_name.1, 
i64 0, i32 0, i32 0 }, section "hip_offloading_entries", align 1
-// HIP: @.offloading.entry_name.2 = internal unnamed_addr constant [4 x i8] 
c"var\00"
+// HIP: @.offloading.entry_name.2 = internal unnamed_addr constant [4 x i8] 
c"var\00", section ".llvm.rodata.offloading", align 1
 // HIP: @.offloading.entry.var = weak constant %struct.__tgt_offload_entry { 
ptr @var, ptr @.offloading.entry_name.2, i64 4, i32 0, i32 0 }, sec

[clang] [Cuda] Handle -fcuda-short-ptr even with -nocudalib (PR #111682)

2024-10-10 Thread Joseph Huber via cfe-commits

jhuber6 wrote:

We don't need marshalling because this isn't a cc1 option. This is just handled 
by the driver which forwards it as `-mllvm` to the backend. You'd need to 
update the LLVM option to take multiple options and then make the clang driver 
option pick between them.

https://github.com/llvm/llvm-project/pull/111682
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [NvlinkWrapper] Use `-plugin-opt=mattr=` instead of a custom feature (PR #111712)

2024-10-09 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 created 
https://github.com/llvm/llvm-project/pull/111712

Summary:
We don't need a custom flag for this, LLVM has a way to get the features
which are forwarded via `plugin-opt`.


>From 6135a70a45801783a770252e765773905f18313d Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Wed, 9 Oct 2024 11:42:55 -0500
Subject: [PATCH] [NvlinkWrapper] Use `-plugin-opt=mattr=` instead of a custom
 feature

Summary:
We don't need a custom flag for this, LLVM has a way to get the features
which are forwarded via `plugin-opt`.
---
 clang/lib/Driver/ToolChains/Cuda.cpp| 4 ++--
 clang/test/Driver/cuda-cross-compiling.c| 2 +-
 clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp | 2 +-
 clang/tools/clang-nvlink-wrapper/NVLinkOpts.td  | 3 ---
 4 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp 
b/clang/lib/Driver/ToolChains/Cuda.cpp
index 509cd87b28c37e..dfcd20a73f1d54 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -632,8 +632,8 @@ void NVPTX::Linker::ConstructJob(Compilation &C, const 
JobAction &JA,
   std::vector Features;
   getNVPTXTargetFeatures(C.getDriver(), getToolChain().getTriple(), Args,
  Features);
-  for (StringRef Feature : Features)
-CmdArgs.append({"--feature", Args.MakeArgString(Feature)});
+  CmdArgs.push_back(
+  Args.MakeArgString("--plugin-opt=mattr=" + llvm::join(Features, ",")));
 
   // Add paths for the default clang library path.
   SmallString<256> DefaultLibPath =
diff --git a/clang/test/Driver/cuda-cross-compiling.c 
b/clang/test/Driver/cuda-cross-compiling.c
index 5f24e7a5accb08..54c291fac66ffd 100644
--- a/clang/test/Driver/cuda-cross-compiling.c
+++ b/clang/test/Driver/cuda-cross-compiling.c
@@ -104,4 +104,4 @@
 // RUN: %clang -target nvptx64-nvidia-cuda --cuda-feature=+ptx63 -march=sm_52 
-### %s 2>&1 \
 // RUN:   | FileCheck -check-prefix=FEATURE %s
 
-// FEATURE: clang-nvlink-wrapper{{.*}}"--feature" "+ptx63"
+// FEATURE: clang-nvlink-wrapper{{.*}}"--plugin-opt=mattr=+ptx63"
diff --git a/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp 
b/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
index b4b376fe0d114e..b9767a7a03d0b5 100644
--- a/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
+++ b/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
@@ -344,7 +344,7 @@ Expected> createLTO(const ArgList 
&Args) {
   Conf.RemarksHotnessThreshold = RemarksHotnessThreshold;
   Conf.RemarksFormat = RemarksFormat;
 
-  Conf.MAttrs = {Args.getLastArgValue(OPT_feature, "").str()};
+  Conf.MAttrs = llvm::codegen::getMAttrs();
   std::optional CGOptLevelOrNone =
   CodeGenOpt::parseLevel(Args.getLastArgValue(OPT_O, "2")[0]);
   assert(CGOptLevelOrNone && "Invalid optimization level");
diff --git a/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td 
b/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td
index eeb9d1a6228240..a80c5937b42992 100644
--- a/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td
+++ b/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td
@@ -47,9 +47,6 @@ def arch : Separate<["--", "-"], "arch">,
 def : Joined<["--", "-"], "plugin-opt=mcpu=">,
   Flags<[HelpHidden, WrapperOnlyOption]>, Alias;
 
-def feature : Separate<["--", "-"], "feature">, Flags<[WrapperOnlyOption]>,
-  HelpText<"Specify the '+ptx' freature to use for LTO.">;
-
 def g : Flag<["-"], "g">, HelpText<"Specify that this was a debug compile.">;
 def debug : Flag<["--"], "debug">, Alias;
 

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][OpenMP] Do not use feature option during packaging (PR #111702)

2024-10-09 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 approved this pull request.

Thanks

https://github.com/llvm/llvm-project/pull/111702
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Cuda] Handle -fcuda-short-ptr even with -nocudalib (PR #111682)

2024-10-09 Thread Joseph Huber via cfe-commits

jhuber6 wrote:

> I'm not sure why we would ever want the current default if this is an option. 
> I'm trying to see it, but I can't work out a case where a 64bit pointer would 
> make sense, since the even tens-of-thousands of money supercomputer cards 
> have less than 256KiB of addressable shared memory.
> 
> It might be a bit of an intrusive change (albeit a relatively mechanical 
> one), but until we see a GPU come to market that has >4GiB addressable shared 
> memory, I think we should use the "short pointer" datalayout as default

It also applies to constant and private / local address spaces. I don't think 
those hit 4 GiB yet but it's more feasible than shared. Making address space 3 
32-bit by default would make sense to me.

https://github.com/llvm/llvm-project/pull/111682
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Cuda] Handle -fcuda-short-ptr even with -nocudalib (PR #111682)

2024-10-09 Thread Joseph Huber via cfe-commits

jhuber6 wrote:

> > Seems reasonable, which architectures require this? I know that NVIDIA 
> > deprecated the 32-bit `nvptx` target in CUDA 12 or something.
> 
> I'm not an expert on CUDA but, AFAICT, even in 64-bit CUDA, certain pointers 
> such as those pointing to shared memory are 32 bit, because the size of 
> shared memory is somewhere in the kB range. This generates better code, fewer 
> registers, etc. I'm not sure why the option isn't enabled by default, 
> personally - it seems like `nvcc` is doing this by default.
> 
> I was just playing with the option downstream and noticed this issue.

I figured it was something like that, since it saves a register per address. I 
don't know the history for why this isn't the default, it's pretty much just a 
data layout modifier to state that certain address spaces are 32-bit. Maybe 
@Artem-B or @jlebar can comment.

https://github.com/llvm/llvm-project/pull/111682
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Cuda] Handle -fcuda-short-ptr even with -nocudalib (PR #111682)

2024-10-09 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 approved this pull request.

Seems reasonable, which architectures require this?

https://github.com/llvm/llvm-project/pull/111682
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Cuda] Handle -fcuda-short-ptr even with -nocudalib (PR #111682)

2024-10-09 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 edited 
https://github.com/llvm/llvm-project/pull/111682
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang] Automatically enable `-fconvergent-functions` on GPU targets (PR #111076)

2024-10-04 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 closed 
https://github.com/llvm/llvm-project/pull/111076
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang] Automatically enable `-fconvergent-functions` on GPU targets (PR #111076)

2024-10-04 Thread Joseph Huber via cfe-commits

jhuber6 wrote:

> > -fno-convergent-functions to opt-out if you want to test broken behavior.
> 
> You may legitimately know there are no convergent functions in the TU. We 
> also have the noconvergent source attribute now for this

Think it would be useful to put that on functions in the wrapper headers that 
definitely aren't convergent? E.g. getting a thread id.

https://github.com/llvm/llvm-project/pull/111076
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang] Automatically enable `-fconvergent-functions` on GPU targets (PR #111076)

2024-10-04 Thread Joseph Huber via cfe-commits


@@ -4106,9 +4106,10 @@ bool CompilerInvocation::ParseLangArgs(LangOptions 
&Opts, ArgList &Args,
   Opts.Blocks = Args.hasArg(OPT_fblocks) || (Opts.OpenCL
 && Opts.OpenCLVersion == 200);
 
-  Opts.ConvergentFunctions = Args.hasArg(OPT_fconvergent_functions) ||
- Opts.OpenCL || (Opts.CUDA && Opts.CUDAIsDevice) ||
- Opts.SYCLIsDevice || Opts.HLSL;
+  Opts.ConvergentFunctions = Args.hasFlag(
+  OPT_fconvergent_functions, OPT_fno_convergent_functions,
+  Opts.OpenMPIsTargetDevice || T.isAMDGPU() || T.isNVPTX() || Opts.OpenCL 
||
+  Opts.CUDAIsDevice || Opts.SYCLIsDevice || Opts.HLSL);

jhuber6 wrote:

Done, I just put it in a variable for now.

https://github.com/llvm/llvm-project/pull/111076
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-10-04 Thread Joseph Huber via cfe-commits

jhuber6 wrote:

> IIRC, you discussed once to have GPU-agnostic intrinsics in LLVM-IR. The 
> backends then have to handle the details.

There's three approaches basically, wrapper header, builtins, and intrinsics. 
We could make some generic intrinsics but it would be a lot more work and 
duplicate a bunch of functions. I think the intrinsics were once suggested by 
@JonChesterfield, so maybe he could chime in. Even with this, it's still 
something we could provide in the future.

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-10-03 Thread Joseph Huber via cfe-commits


@@ -0,0 +1,153 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device))
+#elif !defined(_DEFAULT_ATTRS)
+#define _DEFAULT_ATTRS
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define _Private __attribute__((opencl_private))
+#define _Constant __attribute__((opencl_constant))
+#define _Local __attribute__((opencl_local))
+#define _Global __attribute__((opencl_global))
+
+// Attribute to declare a function as a kernel.
+#define _Kernel __attribute__((amdgpu_kernel, visibility("protected")))

jhuber6 wrote:

The NVPTX target blatantly ignore visibility, so protected doesn't really make 
a huge difference. It does matter for LTO however which is why it's here. The 
GPU targets only ever go to an ELF target right now. We use protected for 
everything because this pretty much does exactly what we want, a symbol visible 
from the GPU ELF that can't be preempted so we don't need weird DSO things.

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-10-03 Thread Joseph Huber via cfe-commits


@@ -0,0 +1,153 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device))
+#elif !defined(_DEFAULT_ATTRS)
+#define _DEFAULT_ATTRS
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define _Private __attribute__((opencl_private))
+#define _Constant __attribute__((opencl_constant))
+#define _Local __attribute__((opencl_local))
+#define _Global __attribute__((opencl_global))

jhuber6 wrote:

That looks very OpenCL, I guess C just wants to reserve any top level name 
since they like to do `_Thread_local` and stuff? I can change it.

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-10-03 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 updated 
https://github.com/llvm/llvm-project/pull/110179

>From 4a3348e56950583fb28211879f5ab157c34cbc66 Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Thu, 26 Sep 2024 16:47:14 -0500
Subject: [PATCH 1/4] [Clang] Implement resource directory headers for common
 GPU intrinsics

Summary:
All GPU based languages provide some way to access things like the
thread ID or other resources. However, this is spread between many
different languages and it varies between targets. The goal here is to
provide a resource directory header that just provides these in an
easier to understand way, primarily so this can be used for C/C++ code.
The interface aims to be common, to facilitate easier porting, but target
specific stuff could be put in the individual headers.
---
 clang/lib/Headers/CMakeLists.txt |  14 +++
 clang/lib/Headers/amdgpuintrin.h | 187 +++
 clang/lib/Headers/gpuintrin.h|  18 +++
 clang/lib/Headers/nvptxintrin.h  | 184 ++
 4 files changed, 403 insertions(+)
 create mode 100644 clang/lib/Headers/amdgpuintrin.h
 create mode 100644 clang/lib/Headers/gpuintrin.h
 create mode 100644 clang/lib/Headers/nvptxintrin.h

diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index ff392e7122a448..a0e7ae67b7219a 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -268,6 +268,12 @@ set(x86_files
   cpuid.h
   )
 
+set(gpu_files
+  gpuintrin.h
+  nvptxintrin.h
+  amdgpuintrin.h
+  )
+
 set(windows_only_files
   intrin0.h
   intrin.h
@@ -296,6 +302,7 @@ set(files
   ${systemz_files}
   ${ve_files}
   ${x86_files}
+  ${gpu_files}
   ${webassembly_files}
   ${windows_only_files}
   ${utility_files}
@@ -518,6 +525,7 @@ add_header_target("systemz-resource-headers" 
"${systemz_files};${zos_wrapper_fil
 add_header_target("ve-resource-headers" "${ve_files}")
 add_header_target("webassembly-resource-headers" "${webassembly_files}")
 add_header_target("x86-resource-headers" "${x86_files}")
+add_header_target("gpu-resource-headers" "${gpu_files}")
 
 # Other header groupings
 add_header_target("hlsl-resource-headers" ${hlsl_files})
@@ -704,6 +712,12 @@ install(
   EXCLUDE_FROM_ALL
   COMPONENT x86-resource-headers)
 
+install(
+  FILES ${gpu_files}
+  DESTINATION ${header_install_dir}
+  EXCLUDE_FROM_ALL
+  COMPONENT gpu-resource-headers)
+
 if(NOT CLANG_ENABLE_HLSL)
   set(EXCLUDE_HLSL EXCLUDE_FROM_ALL)
 endif()
diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
new file mode 100644
index 00..95936f86bd15b8
--- /dev/null
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -0,0 +1,187 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device)) __attribute__((always_inline))
+#else
+#define _DEFAULT_ATTRS __attribute__((always_inline))
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define _private __attribute__((opencl_private))
+#define _constant __attribute__((opencl_constant))
+#define _local __attribute__((opencl_local))
+#define _global __attribute__((opencl_global))
+
+// Attribute to declare a function as a kernel.
+#define _kernel __attribute__((amdgpu_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_x() {
+  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_y() {
+  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workgroups in the 'z' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_z() {
+  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the total number of workgruops in the grid.
+_DEFAULT_ATTRS static inline uint64_t _get_num_blocks() {
+  return _get_num_blocks_x() * _get_num_blocks_y() * _get_num_blocks_z();
+}
+
+// Returns the 'x' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_x() {
+  return __builtin_amdgcn_workgroup_id_x();
+}
+
+// Returns the 'y'

[clang] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-10-03 Thread Joseph Huber via cfe-commits


@@ -0,0 +1,184 @@
+//===-- nvptxintrin.h - NVPTX intrinsic functions 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __NVPTXINTRIN_H
+#define __NVPTXINTRIN_H
+
+#ifndef __NVPTX__
+#error "This file is intended for NVPTX targets or offloading to NVPTX
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device)) __attribute__((always_inline))
+#else
+#define _DEFAULT_ATTRS __attribute__((always_inline))
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(nvptx64)})
+
+// Type aliases to the address spaces used by the NVPTX backend.
+#define _private __attribute__((opencl_private))
+#define _constant __attribute__((opencl_constant))
+#define _local __attribute__((opencl_local))
+#define _global __attribute__((opencl_global))
+
+// Attribute to declare a function as a kernel.
+#define _kernel __attribute__((nvptx_kernel))
+
+// Returns the number of CUDA blocks in the 'x' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_x() {
+  return __nvvm_read_ptx_sreg_nctaid_x();
+}
+
+// Returns the number of CUDA blocks in the 'y' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_y() {
+  return __nvvm_read_ptx_sreg_nctaid_y();
+}
+
+// Returns the number of CUDA blocks in the 'z' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_z() {
+  return __nvvm_read_ptx_sreg_nctaid_z();
+}
+
+// Returns the total number of CUDA blocks.
+_DEFAULT_ATTRS static inline uint64_t _get_num_blocks() {
+  return _get_num_blocks_x() * _get_num_blocks_y() * _get_num_blocks_z();
+}
+
+// Returns the 'x' dimension of the current CUDA block's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_x() {
+  return __nvvm_read_ptx_sreg_ctaid_x();
+}
+
+// Returns the 'y' dimension of the current CUDA block's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_y() {
+  return __nvvm_read_ptx_sreg_ctaid_y();
+}
+
+// Returns the 'z' dimension of the current CUDA block's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_z() {
+  return __nvvm_read_ptx_sreg_ctaid_z();
+}
+
+// Returns the absolute id of the CUDA block.
+_DEFAULT_ATTRS static inline uint64_t _get_block_id() {
+  return _get_block_id_x() + _get_num_blocks_x() * _get_block_id_y() +
+ _get_num_blocks_x() * _get_num_blocks_y() * _get_block_id_z();
+}
+
+// Returns the number of CUDA threads in the 'x' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_x() {
+  return __nvvm_read_ptx_sreg_ntid_x();
+}
+
+// Returns the number of CUDA threads in the 'y' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_y() {
+  return __nvvm_read_ptx_sreg_ntid_y();
+}
+
+// Returns the number of CUDA threads in the 'z' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_z() {
+  return __nvvm_read_ptx_sreg_ntid_z();
+}
+
+// Returns the total number of threads in the block.
+_DEFAULT_ATTRS static inline uint64_t _get_num_threads() {
+  return _get_num_threads_x() * _get_num_threads_y() * _get_num_threads_z();
+}
+
+// Returns the 'x' dimension id of the thread in the current CUDA block.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_x() {
+  return __nvvm_read_ptx_sreg_tid_x();
+}
+
+// Returns the 'y' dimension id of the thread in the current CUDA block.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_y() {
+  return __nvvm_read_ptx_sreg_tid_y();
+}
+
+// Returns the 'z' dimension id of the thread in the current CUDA block.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_z() {
+  return __nvvm_read_ptx_sreg_tid_z();
+}
+
+// Returns the absolute id of the thread in the current CUDA block.
+_DEFAULT_ATTRS static inline uint64_t _get_thread_id() {
+  return _get_thread_id_x() + _get_num_threads_x() * _get_thread_id_y() +
+ _get_num_threads_x() * _get_num_threads_y() * _get_thread_id_z();
+}
+
+// Returns the size of a CUDA warp, always 32 on NVIDIA hardware.
+_DEFAULT_ATTRS static inline uint32_t _get_lane_size() { return 32; }
+
+// Returns the id of the thread inside of a CUDA warp executing together.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t _get_lane_id() {
+  return __nvvm_read_ptx_sreg_laneid();
+}
+
+// Returns the bit-mask of active threads in the current warp.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t _get_lane_mask() {
+  return __nvvm_activemask();
+}
+
+// Copies the value from the first active thread in the warp to the rest.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t
+_broadcast_value(uint64_t lane_mask, uint32_t x) {
+  uint32_t mask = static_cast(lane_mask);
+  u

[clang] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-10-03 Thread Joseph Huber via cfe-commits


@@ -0,0 +1,187 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device)) __attribute__((always_inline))
+#else
+#define _DEFAULT_ATTRS __attribute__((always_inline))
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define _private __attribute__((opencl_private))
+#define _constant __attribute__((opencl_constant))
+#define _local __attribute__((opencl_local))
+#define _global __attribute__((opencl_global))
+
+// Attribute to declare a function as a kernel.
+#define _kernel __attribute__((amdgpu_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_x() {
+  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_y() {
+  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workgroups in the 'z' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_z() {
+  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the total number of workgruops in the grid.
+_DEFAULT_ATTRS static inline uint64_t _get_num_blocks() {
+  return _get_num_blocks_x() * _get_num_blocks_y() * _get_num_blocks_z();
+}
+
+// Returns the 'x' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_x() {
+  return __builtin_amdgcn_workgroup_id_x();
+}
+
+// Returns the 'y' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_y() {
+  return __builtin_amdgcn_workgroup_id_y();
+}
+
+// Returns the 'z' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_z() {
+  return __builtin_amdgcn_workgroup_id_z();
+}
+
+// Returns the absolute id of the AMD workgroup.
+_DEFAULT_ATTRS static inline uint64_t _get_block_id() {
+  return _get_block_id_x() + _get_num_blocks_x() * _get_block_id_y() +
+ _get_num_blocks_x() * _get_num_blocks_y() * _get_block_id_z();
+}
+
+// Returns the number of workitems in the 'x' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_x() {
+  return __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workitems in the 'y' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_y() {
+  return __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workitems in the 'z' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_z() {
+  return __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the total number of workitems in the workgroup.
+_DEFAULT_ATTRS static inline uint64_t _get_num_threads() {
+  return _get_num_threads_x() * _get_num_threads_y() * _get_num_threads_z();
+}
+
+// Returns the 'x' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_x() {
+  return __builtin_amdgcn_workitem_id_x();
+}
+
+// Returns the 'y' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_y() {
+  return __builtin_amdgcn_workitem_id_y();
+}
+
+// Returns the 'z' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_z() {
+  return __builtin_amdgcn_workitem_id_z();
+}
+
+// Returns the absolute id of the thread in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint64_t _get_thread_id() {
+  return _get_thread_id_x() + _get_num_threads_x() * _get_thread_id_y() +
+ _get_num_threads_x() * _get_num_threads_y() * _get_thread_id_z();
+}
+
+// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware
+// and compilation options.
+_DEFAULT_ATTRS static inline uint32_t _get_lane_size() {
+  return __builtin_amdgcn_wavefrontsize();
+}
+
+// Returns the id of the thread inside of an AMD wavefront executing together.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t _get_lane_id() {
+  return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
+}
+
+// Returns the bit-mask of active threads in

[clang] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-10-03 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 ready_for_review 
https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-10-03 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 updated 
https://github.com/llvm/llvm-project/pull/110179

>From 4a3348e56950583fb28211879f5ab157c34cbc66 Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Thu, 26 Sep 2024 16:47:14 -0500
Subject: [PATCH 1/3] [Clang] Implement resource directory headers for common
 GPU intrinsics

Summary:
All GPU based languages provide some way to access things like the
thread ID or other resources. However, this is spread between many
different languages and it varies between targets. The goal here is to
provide a resource directory header that just provides these in an
easier to understand way, primarily so this can be used for C/C++ code.
The interface aims to be common, to facilitate easier porting, but target
specific stuff could be put in the individual headers.
---
 clang/lib/Headers/CMakeLists.txt |  14 +++
 clang/lib/Headers/amdgpuintrin.h | 187 +++
 clang/lib/Headers/gpuintrin.h|  18 +++
 clang/lib/Headers/nvptxintrin.h  | 184 ++
 4 files changed, 403 insertions(+)
 create mode 100644 clang/lib/Headers/amdgpuintrin.h
 create mode 100644 clang/lib/Headers/gpuintrin.h
 create mode 100644 clang/lib/Headers/nvptxintrin.h

diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index ff392e7122a448..a0e7ae67b7219a 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -268,6 +268,12 @@ set(x86_files
   cpuid.h
   )
 
+set(gpu_files
+  gpuintrin.h
+  nvptxintrin.h
+  amdgpuintrin.h
+  )
+
 set(windows_only_files
   intrin0.h
   intrin.h
@@ -296,6 +302,7 @@ set(files
   ${systemz_files}
   ${ve_files}
   ${x86_files}
+  ${gpu_files}
   ${webassembly_files}
   ${windows_only_files}
   ${utility_files}
@@ -518,6 +525,7 @@ add_header_target("systemz-resource-headers" 
"${systemz_files};${zos_wrapper_fil
 add_header_target("ve-resource-headers" "${ve_files}")
 add_header_target("webassembly-resource-headers" "${webassembly_files}")
 add_header_target("x86-resource-headers" "${x86_files}")
+add_header_target("gpu-resource-headers" "${gpu_files}")
 
 # Other header groupings
 add_header_target("hlsl-resource-headers" ${hlsl_files})
@@ -704,6 +712,12 @@ install(
   EXCLUDE_FROM_ALL
   COMPONENT x86-resource-headers)
 
+install(
+  FILES ${gpu_files}
+  DESTINATION ${header_install_dir}
+  EXCLUDE_FROM_ALL
+  COMPONENT gpu-resource-headers)
+
 if(NOT CLANG_ENABLE_HLSL)
   set(EXCLUDE_HLSL EXCLUDE_FROM_ALL)
 endif()
diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
new file mode 100644
index 00..95936f86bd15b8
--- /dev/null
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -0,0 +1,187 @@
+//===-- amdgpuintrin.h - AMDGPU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device)) __attribute__((always_inline))
+#else
+#define _DEFAULT_ATTRS __attribute__((always_inline))
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define _private __attribute__((opencl_private))
+#define _constant __attribute__((opencl_constant))
+#define _local __attribute__((opencl_local))
+#define _global __attribute__((opencl_global))
+
+// Attribute to declare a function as a kernel.
+#define _kernel __attribute__((amdgpu_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_x() {
+  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_y() {
+  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workgroups in the 'z' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_z() {
+  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the total number of workgroups in the grid.
+_DEFAULT_ATTRS static inline uint64_t _get_num_blocks() {
+  return _get_num_blocks_x() * _get_num_blocks_y() * _get_num_blocks_z();
+}
+
+// Returns the 'x' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_x() {
+  return __builtin_amdgcn_workgroup_id_x();
+}
+
+// Returns the 'y'

[clang] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-10-03 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 updated 
https://github.com/llvm/llvm-project/pull/110179

>From 4a3348e56950583fb28211879f5ab157c34cbc66 Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Thu, 26 Sep 2024 16:47:14 -0500
Subject: [PATCH 1/2] [Clang] Implement resource directory headers for common
 GPU intrinsics

Summary:
All GPU based languages provide some way to access things like the
thread ID or other resources. However, this is spread between many
different languages and it varies between targets. The goal here is to
provide a resource directory header that just provides these in an
easier to understand way, primarily so this can be used for C/C++ code.
The interface aims to be common, to facilitate easier porting, but target
specific stuff could be put in the individual headers.
---
 clang/lib/Headers/CMakeLists.txt |  14 +++
 clang/lib/Headers/amdgpuintrin.h | 187 +++
 clang/lib/Headers/gpuintrin.h|  18 +++
 clang/lib/Headers/nvptxintrin.h  | 184 ++
 4 files changed, 403 insertions(+)
 create mode 100644 clang/lib/Headers/amdgpuintrin.h
 create mode 100644 clang/lib/Headers/gpuintrin.h
 create mode 100644 clang/lib/Headers/nvptxintrin.h

diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index ff392e7122a448..a0e7ae67b7219a 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -268,6 +268,12 @@ set(x86_files
   cpuid.h
   )
 
+set(gpu_files
+  gpuintrin.h
+  nvptxintrin.h
+  amdgpuintrin.h
+  )
+
 set(windows_only_files
   intrin0.h
   intrin.h
@@ -296,6 +302,7 @@ set(files
   ${systemz_files}
   ${ve_files}
   ${x86_files}
+  ${gpu_files}
   ${webassembly_files}
   ${windows_only_files}
   ${utility_files}
@@ -518,6 +525,7 @@ add_header_target("systemz-resource-headers" 
"${systemz_files};${zos_wrapper_fil
 add_header_target("ve-resource-headers" "${ve_files}")
 add_header_target("webassembly-resource-headers" "${webassembly_files}")
 add_header_target("x86-resource-headers" "${x86_files}")
+add_header_target("gpu-resource-headers" "${gpu_files}")
 
 # Other header groupings
 add_header_target("hlsl-resource-headers" ${hlsl_files})
@@ -704,6 +712,12 @@ install(
   EXCLUDE_FROM_ALL
   COMPONENT x86-resource-headers)
 
+install(
+  FILES ${gpu_files}
+  DESTINATION ${header_install_dir}
+  EXCLUDE_FROM_ALL
+  COMPONENT gpu-resource-headers)
+
 if(NOT CLANG_ENABLE_HLSL)
   set(EXCLUDE_HLSL EXCLUDE_FROM_ALL)
 endif()
diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
new file mode 100644
index 00..95936f86bd15b8
--- /dev/null
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -0,0 +1,187 @@
+//===-- amdgpuintrin.h - AMDGPU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device)) __attribute__((always_inline))
+#else
+#define _DEFAULT_ATTRS __attribute__((always_inline))
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define _private __attribute__((opencl_private))
+#define _constant __attribute__((opencl_constant))
+#define _local __attribute__((opencl_local))
+#define _global __attribute__((opencl_global))
+
+// Attribute to declare a function as a kernel.
+#define _kernel __attribute__((amdgpu_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_x() {
+  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_y() {
+  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workgroups in the 'z' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_z() {
+  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the total number of workgroups in the grid.
+_DEFAULT_ATTRS static inline uint64_t _get_num_blocks() {
+  return _get_num_blocks_x() * _get_num_blocks_y() * _get_num_blocks_z();
+}
+
+// Returns the 'x' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_x() {
+  return __builtin_amdgcn_workgroup_id_x();
+}
+
+// Returns the 'y'

[clang] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-10-02 Thread Joseph Huber via cfe-commits


@@ -0,0 +1,187 @@
+//===-- amdgpuintrin.h - AMDGPU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device)) __attribute__((always_inline))
+#else
+#define _DEFAULT_ATTRS __attribute__((always_inline))
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define _private __attribute__((opencl_private))
+#define _constant __attribute__((opencl_constant))
+#define _local __attribute__((opencl_local))
+#define _global __attribute__((opencl_global))
+
+// Attribute to declare a function as a kernel.
+#define _kernel __attribute__((amdgpu_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_x() {
+  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_y() {
+  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workgroups in the 'z' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_z() {
+  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the total number of workgroups in the grid.
+_DEFAULT_ATTRS static inline uint64_t _get_num_blocks() {
+  return _get_num_blocks_x() * _get_num_blocks_y() * _get_num_blocks_z();
+}
+
+// Returns the 'x' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_x() {
+  return __builtin_amdgcn_workgroup_id_x();
+}
+
+// Returns the 'y' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_y() {
+  return __builtin_amdgcn_workgroup_id_y();
+}
+
+// Returns the 'z' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_z() {
+  return __builtin_amdgcn_workgroup_id_z();
+}
+
+// Returns the absolute id of the AMD workgroup.
+_DEFAULT_ATTRS static inline uint64_t _get_block_id() {
+  return _get_block_id_x() + _get_num_blocks_x() * _get_block_id_y() +
+ _get_num_blocks_x() * _get_num_blocks_y() * _get_block_id_z();
+}
+
+// Returns the number of workitems in the 'x' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_x() {
+  return __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workitems in the 'y' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_y() {
+  return __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workitems in the 'z' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_z() {
+  return __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the total number of workitems in the workgroup.
+_DEFAULT_ATTRS static inline uint64_t _get_num_threads() {
+  return _get_num_threads_x() * _get_num_threads_y() * _get_num_threads_z();
+}
+
+// Returns the 'x' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_x() {
+  return __builtin_amdgcn_workitem_id_x();
+}
+
+// Returns the 'y' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_y() {
+  return __builtin_amdgcn_workitem_id_y();
+}
+
+// Returns the 'z' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_z() {
+  return __builtin_amdgcn_workitem_id_z();
+}
+
+// Returns the absolute id of the thread in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint64_t _get_thread_id() {
+  return _get_thread_id_x() + _get_num_threads_x() * _get_thread_id_y() +
+ _get_num_threads_x() * _get_num_threads_y() * _get_thread_id_z();
+}
+
+// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware
+// and compilation options.
+_DEFAULT_ATTRS static inline uint32_t _get_lane_size() {
+  return __builtin_amdgcn_wavefrontsize();
+}
+
+// Returns the id of the thread inside of an AMD wavefront executing together.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t _get_lane_id() {
+  return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
+}
+
+// Returns the bit-mask of active threads in

[clang] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-10-02 Thread Joseph Huber via cfe-commits


@@ -0,0 +1,187 @@
+//===-- amdgpuintrin.h - AMDGPU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device)) __attribute__((always_inline))
+#else
+#define _DEFAULT_ATTRS __attribute__((always_inline))
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define _private __attribute__((opencl_private))
+#define _constant __attribute__((opencl_constant))
+#define _local __attribute__((opencl_local))
+#define _global __attribute__((opencl_global))
+
+// Attribute to declare a function as a kernel.
+#define _kernel __attribute__((amdgpu_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_x() {
+  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_y() {
+  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workgroups in the 'z' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_z() {
+  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the total number of workgroups in the grid.
+_DEFAULT_ATTRS static inline uint64_t _get_num_blocks() {
+  return _get_num_blocks_x() * _get_num_blocks_y() * _get_num_blocks_z();
+}
+
+// Returns the 'x' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_x() {
+  return __builtin_amdgcn_workgroup_id_x();
+}
+
+// Returns the 'y' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_y() {
+  return __builtin_amdgcn_workgroup_id_y();
+}
+
+// Returns the 'z' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_z() {
+  return __builtin_amdgcn_workgroup_id_z();
+}
+
+// Returns the absolute id of the AMD workgroup.
+_DEFAULT_ATTRS static inline uint64_t _get_block_id() {
+  return _get_block_id_x() + _get_num_blocks_x() * _get_block_id_y() +
+ _get_num_blocks_x() * _get_num_blocks_y() * _get_block_id_z();
+}
+
+// Returns the number of workitems in the 'x' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_x() {
+  return __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workitems in the 'y' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_y() {
+  return __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workitems in the 'z' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_z() {
+  return __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the total number of workitems in the workgroup.
+_DEFAULT_ATTRS static inline uint64_t _get_num_threads() {
+  return _get_num_threads_x() * _get_num_threads_y() * _get_num_threads_z();
+}
+
+// Returns the 'x' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_x() {
+  return __builtin_amdgcn_workitem_id_x();
+}
+
+// Returns the 'y' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_y() {
+  return __builtin_amdgcn_workitem_id_y();
+}
+
+// Returns the 'z' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_z() {
+  return __builtin_amdgcn_workitem_id_z();
+}
+
+// Returns the absolute id of the thread in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint64_t _get_thread_id() {
+  return _get_thread_id_x() + _get_num_threads_x() * _get_thread_id_y() +
+ _get_num_threads_x() * _get_num_threads_y() * _get_thread_id_z();
+}
+
+// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware
+// and compilation options.
+_DEFAULT_ATTRS static inline uint32_t _get_lane_size() {
+  return __builtin_amdgcn_wavefrontsize();
+}
+
+// Returns the id of the thread inside of an AMD wavefront executing together.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t _get_lane_id() {

jhuber6 wrote:

I also need to fix `-fno-convergent-functions` and then pass it by default for 
GPU architectur

[clang] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-10-02 Thread Joseph Huber via cfe-commits


@@ -0,0 +1,187 @@
+//===-- amdgpuintrin.h - AMDGPU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device)) __attribute__((always_inline))
+#else
+#define _DEFAULT_ATTRS __attribute__((always_inline))
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define _private __attribute__((opencl_private))
+#define _constant __attribute__((opencl_constant))
+#define _local __attribute__((opencl_local))
+#define _global __attribute__((opencl_global))
+
+// Attribute to declare a function as a kernel.
+#define _kernel __attribute__((amdgpu_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_x() {
+  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_y() {
+  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workgroups in the 'z' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_z() {
+  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the total number of workgroups in the grid.
+_DEFAULT_ATTRS static inline uint64_t _get_num_blocks() {
+  return _get_num_blocks_x() * _get_num_blocks_y() * _get_num_blocks_z();
+}
+
+// Returns the 'x' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_x() {
+  return __builtin_amdgcn_workgroup_id_x();
+}
+
+// Returns the 'y' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_y() {
+  return __builtin_amdgcn_workgroup_id_y();
+}
+
+// Returns the 'z' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_z() {
+  return __builtin_amdgcn_workgroup_id_z();
+}
+
+// Returns the absolute id of the AMD workgroup.
+_DEFAULT_ATTRS static inline uint64_t _get_block_id() {
+  return _get_block_id_x() + _get_num_blocks_x() * _get_block_id_y() +
+ _get_num_blocks_x() * _get_num_blocks_y() * _get_block_id_z();
+}
+
+// Returns the number of workitems in the 'x' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_x() {
+  return __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workitems in the 'y' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_y() {
+  return __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workitems in the 'z' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_z() {
+  return __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the total number of workitems in the workgroup.
+_DEFAULT_ATTRS static inline uint64_t _get_num_threads() {
+  return _get_num_threads_x() * _get_num_threads_y() * _get_num_threads_z();
+}
+
+// Returns the 'x' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_x() {
+  return __builtin_amdgcn_workitem_id_x();
+}
+
+// Returns the 'y' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_y() {
+  return __builtin_amdgcn_workitem_id_y();
+}
+
+// Returns the 'z' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_z() {
+  return __builtin_amdgcn_workitem_id_z();
+}
+
+// Returns the absolute id of the thread in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint64_t _get_thread_id() {
+  return _get_thread_id_x() + _get_num_threads_x() * _get_thread_id_y() +
+ _get_num_threads_x() * _get_num_threads_y() * _get_thread_id_z();
+}
+
+// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware
+// and compilation options.
+_DEFAULT_ATTRS static inline uint32_t _get_lane_size() {
+  return __builtin_amdgcn_wavefrontsize();
+}
+
+// Returns the id of the thread inside of an AMD wavefront executing together.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t _get_lane_id() {

jhuber6 wrote:

I consider this more of a documentation thing honestly, it's useless to the 
compiler but lets 

[clang] [compiler-rt] [llvm] [openmp] [PGO][Offload] Add GPU profiling flags to driver (PR #94268)

2024-10-02 Thread Joseph Huber via cfe-commits


@@ -0,0 +1,82 @@
+// RUN: %libomptarget-compile-generic -fprofile-generate-gpu

jhuber6 wrote:

This is a limitation of the PTX target, globals cannot reference themselves. 
Most likely whatever NVIDIA engineer wrote the PTX parser found it annoying to 
reference something that wasn't fully parsed yet so he just decided to make it 
an error and here we are. See https://godbolt.org/z/53PP5c5ve.

https://github.com/llvm/llvm-project/pull/94268
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-09-27 Thread Joseph Huber via cfe-commits

jhuber6 wrote:

I am wondering if it would be easier to provide generic builtins in clang and 
just codegen them. I guess in that case we'd just upscale everything to 64-bit 
and say "If you need the other one use the target specific version".

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-09-27 Thread Joseph Huber via cfe-commits


@@ -0,0 +1,184 @@
+//===-- nvptxintrin.h - NVPTX intrinsic functions 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __NVPTXINTRIN_H
+#define __NVPTXINTRIN_H
+
+#ifndef __NVPTX__
+#error "This file is intended for NVPTX targets or offloading to NVPTX"
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device)) __attribute__((always_inline))
+#else
+#define _DEFAULT_ATTRS __attribute__((always_inline))
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(nvptx64)})
+
+// Type aliases to the address spaces used by the NVPTX backend.
+#define _private __attribute__((opencl_private))
+#define _constant __attribute__((opencl_constant))
+#define _local __attribute__((opencl_local))
+#define _global __attribute__((opencl_global))
+
+// Attribute to declare a function as a kernel.
+#define _kernel __attribute__((nvptx_kernel))
+
+// Returns the number of CUDA blocks in the 'x' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_x() {
+  return __nvvm_read_ptx_sreg_nctaid_x();
+}
+
+// Returns the number of CUDA blocks in the 'y' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_y() {
+  return __nvvm_read_ptx_sreg_nctaid_y();
+}
+
+// Returns the number of CUDA blocks in the 'z' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_z() {
+  return __nvvm_read_ptx_sreg_nctaid_z();
+}
+
+// Returns the total number of CUDA blocks.
+_DEFAULT_ATTRS static inline uint64_t _get_num_blocks() {
+  return _get_num_blocks_x() * _get_num_blocks_y() * _get_num_blocks_z();
+}
+
+// Returns the 'x' dimension of the current CUDA block's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_x() {
+  return __nvvm_read_ptx_sreg_ctaid_x();
+}
+
+// Returns the 'y' dimension of the current CUDA block's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_y() {
+  return __nvvm_read_ptx_sreg_ctaid_y();
+}
+
+// Returns the 'z' dimension of the current CUDA block's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_z() {
+  return __nvvm_read_ptx_sreg_ctaid_z();
+}
+
+// Returns the absolute id of the CUDA block.
+_DEFAULT_ATTRS static inline uint64_t _get_block_id() {
+  return _get_block_id_x() + _get_num_blocks_x() * _get_block_id_y() +
+ _get_num_blocks_x() * _get_num_blocks_y() * _get_block_id_z();
+}
+
+// Returns the number of CUDA threads in the 'x' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_x() {
+  return __nvvm_read_ptx_sreg_ntid_x();
+}
+
+// Returns the number of CUDA threads in the 'y' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_y() {
+  return __nvvm_read_ptx_sreg_ntid_y();
+}
+
+// Returns the number of CUDA threads in the 'z' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_z() {
+  return __nvvm_read_ptx_sreg_ntid_z();
+}
+
+// Returns the total number of threads in the block.
+_DEFAULT_ATTRS static inline uint64_t _get_num_threads() {
+  return _get_num_threads_x() * _get_num_threads_y() * _get_num_threads_z();
+}
+
+// Returns the 'x' dimension id of the thread in the current CUDA block.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_x() {
+  return __nvvm_read_ptx_sreg_tid_x();
+}
+
+// Returns the 'y' dimension id of the thread in the current CUDA block.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_y() {
+  return __nvvm_read_ptx_sreg_tid_y();
+}
+
+// Returns the 'z' dimension id of the thread in the current CUDA block.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_z() {
+  return __nvvm_read_ptx_sreg_tid_z();
+}
+
+// Returns the absolute id of the thread in the current CUDA block.
+_DEFAULT_ATTRS static inline uint64_t _get_thread_id() {
+  return _get_thread_id_x() + _get_num_threads_x() * _get_thread_id_y() +
+ _get_num_threads_x() * _get_num_threads_y() * _get_thread_id_z();
+}
+
+// Returns the size of a CUDA warp, always 32 on NVIDIA hardware.
+_DEFAULT_ATTRS static inline uint32_t _get_lane_size() { return 32; }

jhuber6 wrote:

Well, as of 5 seconds ago you can now use `__nvvm_read_ptx_sreg_warpsize`.

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [NVPTX] Add a clang builtin for the `warpsize` intrinsic (PR #110316)

2024-09-27 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 closed 
https://github.com/llvm/llvm-project/pull/110316
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-09-27 Thread Joseph Huber via cfe-commits


@@ -0,0 +1,187 @@
+//===-- amdgpuintrin.h - AMDGPU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device)) __attribute__((always_inline))
+#else
+#define _DEFAULT_ATTRS __attribute__((always_inline))
+#endif
+
+#pragma omp begin declare target device_type(nohost)

jhuber6 wrote:

These pragmas are just ignored without the OpenMP language enabled.

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [NVPTX] Add a clang builtin for the `warpsize` intrinsic (PR #110316)

2024-09-27 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 created 
https://github.com/llvm/llvm-project/pull/110316

Summary:
There's an intrinsic for the warp size, we want to expose this to make
the interface proposed in
https://github.com/llvm/llvm-project/pull/110179 more generic.


>From 63d45843ee15c940680e4d6a3ea87138ebfc5b69 Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Fri, 27 Sep 2024 14:08:51 -0500
Subject: [PATCH] [NVPTX] Add a clang builtin for the `warpsize` intrinsic

Summary:
There's an intrinsic for the warp size, we want to expose this to make
the interface proposed in
https://github.com/llvm/llvm-project/pull/110179 more generic.
---
 clang/include/clang/Basic/BuiltinsNVPTX.def | 1 +
 clang/test/CodeGen/builtins-nvptx.c | 4 +++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def 
b/clang/include/clang/Basic/BuiltinsNVPTX.def
index 6fff562165080a..6b7bce5bc00d4f 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.def
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -139,6 +139,7 @@ TARGET_BUILTIN(__nvvm_is_explicit_cluster, "b", "nc", 
AND(SM_90, PTX78))
 BUILTIN(__nvvm_read_ptx_sreg_laneid, "i", "nc")
 BUILTIN(__nvvm_read_ptx_sreg_warpid, "i", "nc")
 BUILTIN(__nvvm_read_ptx_sreg_nwarpid, "i", "nc")
+BUILTIN(__nvvm_read_ptx_sreg_warpsize, "i", "nc")
 
 BUILTIN(__nvvm_read_ptx_sreg_smid, "i", "nc")
 BUILTIN(__nvvm_read_ptx_sreg_nsmid, "i", "nc")
diff --git a/clang/test/CodeGen/builtins-nvptx.c 
b/clang/test/CodeGen/builtins-nvptx.c
index bfa72e8bd69454..0d0e3ecdb90c9e 100644
--- a/clang/test/CodeGen/builtins-nvptx.c
+++ b/clang/test/CodeGen/builtins-nvptx.c
@@ -114,6 +114,7 @@ __device__ int read_ids() {
 // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.smid()
 // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nsmid()
 // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.gridid()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
 
   int a = __nvvm_read_ptx_sreg_laneid();
   int b = __nvvm_read_ptx_sreg_warpid();
@@ -121,8 +122,9 @@ __device__ int read_ids() {
   int d = __nvvm_read_ptx_sreg_smid();
   int e = __nvvm_read_ptx_sreg_nsmid();
   int f = __nvvm_read_ptx_sreg_gridid();
+  int g = __nvvm_read_ptx_sreg_warpsize();
 
-  return a + b + c + d + e + f;
+  return a + b + c + d + e + f + g;
 
 }
 

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-09-27 Thread Joseph Huber via cfe-commits


@@ -0,0 +1,184 @@
+//===-- nvptxintrin.h - NVPTX intrinsic functions 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __NVPTXINTRIN_H
+#define __NVPTXINTRIN_H
+
+#ifndef __NVPTX__
+#error "This file is intended for NVPTX targets or offloading to NVPTX"
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device)) __attribute__((always_inline))
+#else
+#define _DEFAULT_ATTRS __attribute__((always_inline))
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(nvptx64)})
+
+// Type aliases to the address spaces used by the NVPTX backend.
+#define _private __attribute__((opencl_private))
+#define _constant __attribute__((opencl_constant))
+#define _local __attribute__((opencl_local))
+#define _global __attribute__((opencl_global))
+
+// Attribute to declare a function as a kernel.
+#define _kernel __attribute__((nvptx_kernel))
+
+// Returns the number of CUDA blocks in the 'x' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_x() {
+  return __nvvm_read_ptx_sreg_nctaid_x();
+}
+
+// Returns the number of CUDA blocks in the 'y' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_y() {
+  return __nvvm_read_ptx_sreg_nctaid_y();
+}
+
+// Returns the number of CUDA blocks in the 'z' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_z() {
+  return __nvvm_read_ptx_sreg_nctaid_z();
+}
+
+// Returns the total number of CUDA blocks.
+_DEFAULT_ATTRS static inline uint64_t _get_num_blocks() {
+  return _get_num_blocks_x() * _get_num_blocks_y() * _get_num_blocks_z();
+}
+
+// Returns the 'x' dimension of the current CUDA block's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_x() {
+  return __nvvm_read_ptx_sreg_ctaid_x();
+}
+
+// Returns the 'y' dimension of the current CUDA block's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_y() {
+  return __nvvm_read_ptx_sreg_ctaid_y();
+}
+
+// Returns the 'z' dimension of the current CUDA block's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_z() {
+  return __nvvm_read_ptx_sreg_ctaid_z();
+}
+
+// Returns the absolute id of the CUDA block.
+_DEFAULT_ATTRS static inline uint64_t _get_block_id() {
+  return _get_block_id_x() + _get_num_blocks_x() * _get_block_id_y() +
+ _get_num_blocks_x() * _get_num_blocks_y() * _get_block_id_z();
+}
+
+// Returns the number of CUDA threads in the 'x' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_x() {
+  return __nvvm_read_ptx_sreg_ntid_x();
+}
+
+// Returns the number of CUDA threads in the 'y' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_y() {
+  return __nvvm_read_ptx_sreg_ntid_y();
+}
+
+// Returns the number of CUDA threads in the 'z' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_z() {
+  return __nvvm_read_ptx_sreg_ntid_z();
+}
+
+// Returns the total number of threads in the block.
+_DEFAULT_ATTRS static inline uint64_t _get_num_threads() {
+  return _get_num_threads_x() * _get_num_threads_y() * _get_num_threads_z();
+}
+
+// Returns the 'x' dimension id of the thread in the current CUDA block.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_x() {
+  return __nvvm_read_ptx_sreg_tid_x();
+}
+
+// Returns the 'y' dimension id of the thread in the current CUDA block.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_y() {
+  return __nvvm_read_ptx_sreg_tid_y();
+}
+
+// Returns the 'z' dimension id of the thread in the current CUDA block.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_z() {
+  return __nvvm_read_ptx_sreg_tid_z();
+}
+
+// Returns the absolute id of the thread in the current CUDA block.
+_DEFAULT_ATTRS static inline uint64_t _get_thread_id() {
+  return _get_thread_id_x() + _get_num_threads_x() * _get_thread_id_y() +
+ _get_num_threads_x() * _get_num_threads_y() * _get_thread_id_z();
+}
+
+// Returns the size of a CUDA warp, always 32 on NVIDIA hardware.
+_DEFAULT_ATTRS static inline uint32_t _get_lane_size() { return 32; }
+
+// Returns the id of the thread inside of a CUDA warp executing together.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t _get_lane_id() {
+  return __nvvm_read_ptx_sreg_laneid();
+}
+
+// Returns the bit-mask of active threads in the current warp.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t _get_lane_mask() {
+  return __nvvm_activemask();
+}
+
+// Copies the value from the first active thread in the warp to the rest.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t
+_broadcast_value(uint64_t lane_mask, uint32_t x) {
+  uint32_t mask = static_cast(lane_mask);
+  u

[clang] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-09-27 Thread Joseph Huber via cfe-commits


@@ -0,0 +1,187 @@
+//===-- amdgpuintrin.h - AMDGPU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device)) __attribute__((always_inline))
+#else
+#define _DEFAULT_ATTRS __attribute__((always_inline))
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define _private __attribute__((opencl_private))
+#define _constant __attribute__((opencl_constant))
+#define _local __attribute__((opencl_local))
+#define _global __attribute__((opencl_global))

jhuber6 wrote:

Yeah, I wasn't sure whether or not it's required to have protected names (could 
do _Local or __local). I'd probably prefer the former since it's what C uses 
for its special types.

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-09-27 Thread Joseph Huber via cfe-commits


@@ -0,0 +1,187 @@
+//===-- amdgpuintrin.h - AMDGPU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device)) __attribute__((always_inline))
+#else
+#define _DEFAULT_ATTRS __attribute__((always_inline))
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define _private __attribute__((opencl_private))
+#define _constant __attribute__((opencl_constant))
+#define _local __attribute__((opencl_local))
+#define _global __attribute__((opencl_global))
+
+// Attribute to declare a function as a kernel.
+#define _kernel __attribute__((amdgpu_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_x() {
+  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_y() {
+  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workgroups in the 'z' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_z() {
+  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the total number of workgroups in the grid.
+_DEFAULT_ATTRS static inline uint64_t _get_num_blocks() {

jhuber6 wrote:

This is being conservative with the API as listed, AFAIK CUDA says that the 
maximum X block size is 2^31 - 1 and Y and Z are 2^16 -1, meaning if you get 
the "global" ID, it could technically be more than 2^32 - 1. Maybe @Artem-B 
could chime in here.

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-09-27 Thread Joseph Huber via cfe-commits


@@ -0,0 +1,187 @@
+//===-- amdgpuintrin.h - AMDGPU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device)) __attribute__((always_inline))
+#else
+#define _DEFAULT_ATTRS __attribute__((always_inline))
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define _private __attribute__((opencl_private))
+#define _constant __attribute__((opencl_constant))
+#define _local __attribute__((opencl_local))
+#define _global __attribute__((opencl_global))
+
+// Attribute to declare a function as a kernel.
+#define _kernel __attribute__((amdgpu_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_x() {
+  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_y() {
+  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workgroups in the 'z' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_z() {
+  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the total number of workgroups in the grid.
+_DEFAULT_ATTRS static inline uint64_t _get_num_blocks() {
+  return _get_num_blocks_x() * _get_num_blocks_y() * _get_num_blocks_z();
+}
+
+// Returns the 'x' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_x() {
+  return __builtin_amdgcn_workgroup_id_x();
+}
+
+// Returns the 'y' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_y() {
+  return __builtin_amdgcn_workgroup_id_y();
+}
+
+// Returns the 'z' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_z() {
+  return __builtin_amdgcn_workgroup_id_z();
+}
+
+// Returns the absolute id of the AMD workgroup.
+_DEFAULT_ATTRS static inline uint64_t _get_block_id() {
+  return _get_block_id_x() + _get_num_blocks_x() * _get_block_id_y() +
+ _get_num_blocks_x() * _get_num_blocks_y() * _get_block_id_z();
+}
+
+// Returns the number of workitems in the 'x' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_x() {
+  return __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workitems in the 'y' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_y() {
+  return __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workitems in the 'z' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_z() {
+  return __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the total number of workitems in the workgroup.
+_DEFAULT_ATTRS static inline uint64_t _get_num_threads() {
+  return _get_num_threads_x() * _get_num_threads_y() * _get_num_threads_z();
+}
+
+// Returns the 'x' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_x() {
+  return __builtin_amdgcn_workitem_id_x();
+}
+
+// Returns the 'y' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_y() {
+  return __builtin_amdgcn_workitem_id_y();
+}
+
+// Returns the 'z' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_z() {
+  return __builtin_amdgcn_workitem_id_z();
+}
+
+// Returns the absolute id of the thread in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint64_t _get_thread_id() {
+  return _get_thread_id_x() + _get_num_threads_x() * _get_thread_id_y() +
+ _get_num_threads_x() * _get_num_threads_y() * _get_thread_id_z();
+}
+
+// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware
+// and compilation options.
+_DEFAULT_ATTRS static inline uint32_t _get_lane_size() {

jhuber6 wrote:

Yeah, this is the one name that I really struggled with since there's a lot of 
different words. It could be `get_num_lanes()` to indicate more that it's the 
SIMT level. Either that or just accept `wave` or `warp` as the go-to word.

https://github.com/llvm/llvm-project/pull/110179
__

[clang] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-09-27 Thread Joseph Huber via cfe-commits


@@ -0,0 +1,187 @@
+//===-- amdgpuintrin.h - AMDGPU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device)) __attribute__((always_inline))
+#else
+#define _DEFAULT_ATTRS __attribute__((always_inline))
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define _private __attribute__((opencl_private))
+#define _constant __attribute__((opencl_constant))
+#define _local __attribute__((opencl_local))
+#define _global __attribute__((opencl_global))
+
+// Attribute to declare a function as a kernel.
+#define _kernel __attribute__((amdgpu_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_x() {
+  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_y() {
+  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workgroups in the 'z' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_z() {
+  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the total number of workgroups in the grid.
+_DEFAULT_ATTRS static inline uint64_t _get_num_blocks() {
+  return _get_num_blocks_x() * _get_num_blocks_y() * _get_num_blocks_z();
+}
+
+// Returns the 'x' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_x() {
+  return __builtin_amdgcn_workgroup_id_x();
+}
+
+// Returns the 'y' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_y() {
+  return __builtin_amdgcn_workgroup_id_y();
+}
+
+// Returns the 'z' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_z() {
+  return __builtin_amdgcn_workgroup_id_z();
+}
+
+// Returns the absolute id of the AMD workgroup.
+_DEFAULT_ATTRS static inline uint64_t _get_block_id() {
+  return _get_block_id_x() + _get_num_blocks_x() * _get_block_id_y() +
+ _get_num_blocks_x() * _get_num_blocks_y() * _get_block_id_z();
+}
+
+// Returns the number of workitems in the 'x' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_x() {
+  return __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workitems in the 'y' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_y() {
+  return __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workitems in the 'z' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_z() {
+  return __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the total number of workitems in the workgroup.
+_DEFAULT_ATTRS static inline uint64_t _get_num_threads() {
+  return _get_num_threads_x() * _get_num_threads_y() * _get_num_threads_z();
+}
+
+// Returns the 'x' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_x() {
+  return __builtin_amdgcn_workitem_id_x();
+}
+
+// Returns the 'y' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_y() {
+  return __builtin_amdgcn_workitem_id_y();
+}
+
+// Returns the 'z' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_z() {
+  return __builtin_amdgcn_workitem_id_z();
+}
+
+// Returns the absolute id of the thread in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint64_t _get_thread_id() {
+  return _get_thread_id_x() + _get_num_threads_x() * _get_thread_id_y() +
+ _get_num_threads_x() * _get_num_threads_y() * _get_thread_id_z();
+}
+
+// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware
+// and compilation options.
+_DEFAULT_ATTRS static inline uint32_t _get_lane_size() {
+  return __builtin_amdgcn_wavefrontsize();
+}
+
+// Returns the id of the thread inside of an AMD wavefront executing together.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t _get_lane_id() {
+  return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
+}
+
+// Returns the bit-mask of active threads in

[clang] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-09-27 Thread Joseph Huber via cfe-commits


@@ -0,0 +1,184 @@
+//===-- nvptxintrin.h - NVPTX intrinsic functions 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __NVPTXINTRIN_H
+#define __NVPTXINTRIN_H
+
+#ifndef __NVPTX__
+#error "This file is intended for NVPTX targets or offloading to NVPTX"
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device)) __attribute__((always_inline))
+#else
+#define _DEFAULT_ATTRS __attribute__((always_inline))
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(nvptx64)})
+
+// Type aliases to the address spaces used by the NVPTX backend.
+#define _private __attribute__((opencl_private))
+#define _constant __attribute__((opencl_constant))
+#define _local __attribute__((opencl_local))
+#define _global __attribute__((opencl_global))
+
+// Attribute to declare a function as a kernel.
+#define _kernel __attribute__((nvptx_kernel))
+
+// Returns the number of CUDA blocks in the 'x' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_x() {
+  return __nvvm_read_ptx_sreg_nctaid_x();
+}
+
+// Returns the number of CUDA blocks in the 'y' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_y() {
+  return __nvvm_read_ptx_sreg_nctaid_y();
+}
+
+// Returns the number of CUDA blocks in the 'z' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_z() {
+  return __nvvm_read_ptx_sreg_nctaid_z();
+}
+
+// Returns the total number of CUDA blocks.
+_DEFAULT_ATTRS static inline uint64_t _get_num_blocks() {
+  return _get_num_blocks_x() * _get_num_blocks_y() * _get_num_blocks_z();
+}
+
+// Returns the 'x' dimension of the current CUDA block's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_x() {
+  return __nvvm_read_ptx_sreg_ctaid_x();
+}
+
+// Returns the 'y' dimension of the current CUDA block's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_y() {
+  return __nvvm_read_ptx_sreg_ctaid_y();
+}
+
+// Returns the 'z' dimension of the current CUDA block's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_z() {
+  return __nvvm_read_ptx_sreg_ctaid_z();
+}
+
+// Returns the absolute id of the CUDA block.
+_DEFAULT_ATTRS static inline uint64_t _get_block_id() {
+  return _get_block_id_x() + _get_num_blocks_x() * _get_block_id_y() +
+ _get_num_blocks_x() * _get_num_blocks_y() * _get_block_id_z();
+}
+
+// Returns the number of CUDA threads in the 'x' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_x() {
+  return __nvvm_read_ptx_sreg_ntid_x();
+}
+
+// Returns the number of CUDA threads in the 'y' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_y() {
+  return __nvvm_read_ptx_sreg_ntid_y();
+}
+
+// Returns the number of CUDA threads in the 'z' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_z() {
+  return __nvvm_read_ptx_sreg_ntid_z();
+}
+
+// Returns the total number of threads in the block.
+_DEFAULT_ATTRS static inline uint64_t _get_num_threads() {
+  return _get_num_threads_x() * _get_num_threads_y() * _get_num_threads_z();
+}
+
+// Returns the 'x' dimension id of the thread in the current CUDA block.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_x() {
+  return __nvvm_read_ptx_sreg_tid_x();
+}
+
+// Returns the 'y' dimension id of the thread in the current CUDA block.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_y() {
+  return __nvvm_read_ptx_sreg_tid_y();
+}
+
+// Returns the 'z' dimension id of the thread in the current CUDA block.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_z() {
+  return __nvvm_read_ptx_sreg_tid_z();
+}
+
+// Returns the absolute id of the thread in the current CUDA block.
+_DEFAULT_ATTRS static inline uint64_t _get_thread_id() {
+  return _get_thread_id_x() + _get_num_threads_x() * _get_thread_id_y() +
+ _get_num_threads_x() * _get_num_threads_y() * _get_thread_id_z();
+}
+
+// Returns the size of a CUDA warp, always 32 on NVIDIA hardware.
+_DEFAULT_ATTRS static inline uint32_t _get_lane_size() { return 32; }

jhuber6 wrote:

That's not exposed to `clang` as a builtin, but I could (and probably should) 
add it.

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] 787a6d5 - [nvlink-wrapper] Remove use of symlinks

2024-09-27 Thread Joseph Huber via cfe-commits

Author: Joseph Huber
Date: 2024-09-27T12:05:56-05:00
New Revision: 787a6d57f95ff6eaee8df01392900a6eea512930

URL: 
https://github.com/llvm/llvm-project/commit/787a6d57f95ff6eaee8df01392900a6eea512930
DIFF: 
https://github.com/llvm/llvm-project/commit/787a6d57f95ff6eaee8df01392900a6eea512930.diff

LOG: [nvlink-wrapper] Remove use of symlinks

Summary:
This was intended to be a neat optimization, but some objects come from
archives so this won't work. I could write some code to detect if it
came from an archive but I don't think it's worth the complexity when
this already doesn't work on Windows.

Added: 


Modified: 
clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp

Removed: 




diff  --git a/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp 
b/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
index 8ec1f722fa8a10..b4b376fe0d114e 100644
--- a/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
+++ b/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
@@ -236,9 +236,8 @@ void printCommands(ArrayRef CmdArgs) {
   if (CmdArgs.empty())
 return;
 
-  llvm::errs() << " \"" << CmdArgs.front() << "\" ";
-  llvm::errs() << llvm::join(std::next(CmdArgs.begin()), CmdArgs.end(), " ")
-   << "\n";
+  errs() << " \"" << CmdArgs.front() << "\" ";
+  errs() << join(std::next(CmdArgs.begin()), CmdArgs.end(), " ") << "\n";
 }
 
 /// A minimum symbol interface that provides the necessary information to
@@ -329,12 +328,12 @@ Expected> createLTO(const 
ArgList &Args) {
   lto::ThinBackend Backend;
   unsigned Jobs = 0;
   if (auto *Arg = Args.getLastArg(OPT_jobs))
-if (!llvm::to_integer(Arg->getValue(), Jobs) || Jobs == 0)
+if (!to_integer(Arg->getValue(), Jobs) || Jobs == 0)
   reportError(createStringError("%s: expected a positive integer, got 
'%s'",
 Arg->getSpelling().data(),
 Arg->getValue()));
-  Backend = lto::createInProcessThinBackend(
-  llvm::heavyweight_hardware_concurrency(Jobs));
+  Backend =
+  lto::createInProcessThinBackend(heavyweight_hardware_concurrency(Jobs));
 
   Conf.CPU = Args.getLastArgValue(OPT_arch);
   Conf.Options = codegen::InitTargetOptionsFromCodeGenFlags(Triple);
@@ -378,7 +377,7 @@ Expected> createLTO(const ArgList 
&Args) {
 
   unsigned Partitions = 1;
   if (auto *Arg = Args.getLastArg(OPT_lto_partitions))
-if (!llvm::to_integer(Arg->getValue(), Partitions) || Partitions == 0)
+if (!to_integer(Arg->getValue(), Partitions) || Partitions == 0)
   reportError(createStringError("%s: expected a positive integer, got 
'%s'",
 Arg->getSpelling().data(),
 Arg->getValue()));
@@ -510,7 +509,7 @@ Expected> getInput(const ArgList 
&Args) {
   InputFiles.emplace_back(std::move(*BufferOrErr), /*IsLazy=*/false);
   break;
 case file_magic::archive: {
-  Expected> LibFile =
+  Expected> LibFile =
   object::Archive::create(Buffer);
   if (!LibFile)
 return LibFile.takeError();
@@ -563,7 +562,7 @@ Expected> getInput(const ArgList 
&Args) {
   for (auto &Input : LinkerInput)
 if (identify_magic(Input->getBuffer()) == file_magic::bitcode)
   BitcodeFiles.emplace_back(std::move(Input));
-  llvm::erase_if(LinkerInput, [](const auto &F) { return !F; });
+  erase_if(LinkerInput, [](const auto &F) { return !F; });
 
   // Run the LTO pipeline on the extracted inputs.
   SmallVector Files;
@@ -574,7 +573,7 @@ Expected> getInput(const ArgList 
&Args) {
 lto::LTO  BitcodeFileOrErr =
-  llvm::lto::InputFile::create(*BitcodeFile);
+  lto::InputFile::create(*BitcodeFile);
   if (!BitcodeFileOrErr)
 return BitcodeFileOrErr.takeError();
 
@@ -638,7 +637,7 @@ Expected> getInput(const ArgList 
&Args) {
   if (std::error_code EC = sys::fs::openFileForWrite(TempFile, FD))
 reportError(errorCodeToError(EC));
   return std::make_unique(
-  std::make_unique(FD, true));
+  std::make_unique(FD, true));
 };
 
 if (Error Err = LTOBackend.run(AddStream))
@@ -655,11 +654,11 @@ Expected> getInput(const ArgList 
&Args) {
 }
   }
 
-  // Create a link for each file to a new file ending in `.cubin`. The 'nvlink'
+  // Create a copy for each file to a new file ending in `.cubin`. The 'nvlink'
   // linker requires all NVPTX inputs to have this extension for some reason.
-  // Windows cannot create symbolic links so we just copy the whole file.
+  // We don't use a symbolic link because it's not supported on Windows and 
some
+  // of these input files could be extracted from an archive.
   for (auto &Input : LinkerInput) {
-#ifdef _WIN32
 auto TempFileOrErr = createTempFile(
 Args, sys::path::stem(Input->get

[clang] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-09-27 Thread Joseph Huber via cfe-commits

jhuber6 wrote:

> Probably want a longer prefix. _gpu or_llvm or similar.

Yeah, just wasn't sure. Also, do resource headers need to be in a reserved 
namespace? Probably nothing wrong with `gpu_get_thread_id` vs 
`_gpu_get_thread_id`.



> If the shared header gets the declarations then people can include the 
> intrin.h and look at it to see what functions they have available, without 
> going and looking through all the implementations. That seems like a good 
> thing. Can put descriptive comments in the main header then.

Yeah I was actually wondering if I should go for something like this:
```c
#ifdef __NVPTX__
uint32_t nvptx_get_thread_id_x() { return __nvvm_ptx_read_sreg_tid_x(); }
#define IMPL nvptx
#endif
uint32_t gpu_get_thread_id_x() { return ##IMPL##_get_thread_id_x(); }
#undef IMPL
```

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [nvlink-wrapper] Use a symbolic link instead of copying the file (PR #110139)

2024-09-26 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 closed 
https://github.com/llvm/llvm-project/pull/110139
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-09-26 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 updated 
https://github.com/llvm/llvm-project/pull/110179

>From f5a8afe139a25f13989556d40e29b98788934dd9 Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Thu, 26 Sep 2024 16:47:14 -0500
Subject: [PATCH] [Clang] Implement resource directory headers for common GPU
 intrinsics

Summary:
All GPU based languages provide some way to access things like the
thread ID or other resources. However, this is spread between many
different languages and it varies between targets. The goal here is to
provide a resource directory header that just provides these in an
easier to understand way, primarily so this can be used for C/C++ code.
The interface aims to be common, to facilitate easier porting, but target
specific stuff could be put in the individual headers.
---
 clang/lib/Headers/CMakeLists.txt |  14 +++
 clang/lib/Headers/amdgpuintrin.h | 187 +++
 clang/lib/Headers/gpuintrin.h|  18 +++
 clang/lib/Headers/nvptxintrin.h  | 184 ++
 4 files changed, 403 insertions(+)
 create mode 100644 clang/lib/Headers/amdgpuintrin.h
 create mode 100644 clang/lib/Headers/gpuintrin.h
 create mode 100644 clang/lib/Headers/nvptxintrin.h

diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index f5cc07c303f9eb..b439db1b2ac169 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -267,6 +267,12 @@ set(x86_files
   cpuid.h
   )
 
+set(gpu_files
+  gpuintrin.h
+  nvptxintrin.h
+  amdgpuintrin.h
+  )
+
 set(windows_only_files
   intrin0.h
   intrin.h
@@ -295,6 +301,7 @@ set(files
   ${systemz_files}
   ${ve_files}
   ${x86_files}
+  ${gpu_files}
   ${webassembly_files}
   ${windows_only_files}
   ${utility_files}
@@ -517,6 +524,7 @@ add_header_target("systemz-resource-headers" 
"${systemz_files};${zos_wrapper_fil
 add_header_target("ve-resource-headers" "${ve_files}")
 add_header_target("webassembly-resource-headers" "${webassembly_files}")
 add_header_target("x86-resource-headers" "${x86_files}")
+add_header_target("gpu-resource-headers" "${gpu_files}")
 
 # Other header groupings
 add_header_target("hlsl-resource-headers" ${hlsl_files})
@@ -703,6 +711,12 @@ install(
   EXCLUDE_FROM_ALL
   COMPONENT x86-resource-headers)
 
+install(
+  FILES ${gpu_files}
+  DESTINATION ${header_install_dir}
+  EXCLUDE_FROM_ALL
+  COMPONENT gpu-resource-headers)
+
 if(NOT CLANG_ENABLE_HLSL)
   set(EXCLUDE_HLSL EXCLUDE_FROM_ALL)
 endif()
diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
new file mode 100644
index 00..95936f86bd15b8
--- /dev/null
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -0,0 +1,187 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device)) __attribute__((always_inline))
+#else
+#define _DEFAULT_ATTRS __attribute__((always_inline))
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define _private __attribute__((opencl_private))
+#define _constant __attribute__((opencl_constant))
+#define _local __attribute__((opencl_local))
+#define _global __attribute__((opencl_global))
+
+// Attribute to declare a function as a kernel.
+#define _kernel __attribute__((amdgpu_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_x() {
+  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_y() {
+  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workgroups in the 'z' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_z() {
+  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the total number of workgroups in the grid.
+_DEFAULT_ATTRS static inline uint64_t _get_num_blocks() {
+  return _get_num_blocks_x() * _get_num_blocks_y() * _get_num_blocks_z();
+}
+
+// Returns the 'x' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_x() {
+  return __builtin_amdgcn_workgroup_id_x();
+}
+
+// Returns the 'y' dim

[clang] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-09-26 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 created 
https://github.com/llvm/llvm-project/pull/110179

Summary:
All GPU based languages provide some way to access things like the
thread ID or other resources. However, this is spread between many
different languages and it varies between targets. The goal here is to
provide a resource directory header that just provides these in an
easier to understand way, primarily so this can be used for C/C++ code.
The interface aims to be common, to facilitate easier porting, but target
specific stuff could be put in the individual headers.


>From 8a817316b38e0c9b3847898037e71076de23f92c Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Thu, 26 Sep 2024 16:47:14 -0500
Subject: [PATCH] [Clang] Implement resource directory headers for common GPU
 intrinsics

Summary:
All GPU based languages provide some way to access things like the
thread ID or other resources. However, this is spread between many
different languages and it varies between targets. The goal here is to
provide a resource directory header that just provides these in an
easier to understand way, primarily so this can be used for C/C++ code.
The interface aims to be common, to facilitate easier porting, but target
specific stuff could be put in the individual headers.
---
 clang/lib/Headers/CMakeLists.txt |  14 +++
 clang/lib/Headers/amdgpuintrin.h | 193 +++
 clang/lib/Headers/gpuintrin.h|  18 +++
 clang/lib/Headers/nvptxintrin.h  | 190 ++
 4 files changed, 415 insertions(+)
 create mode 100644 clang/lib/Headers/amdgpuintrin.h
 create mode 100644 clang/lib/Headers/gpuintrin.h
 create mode 100644 clang/lib/Headers/nvptxintrin.h

diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index f5cc07c303f9eb..b439db1b2ac169 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -267,6 +267,12 @@ set(x86_files
   cpuid.h
   )
 
+set(gpu_files
+  gpuintrin.h
+  nvptxintrin.h
+  amdgpuintrin.h
+  )
+
 set(windows_only_files
   intrin0.h
   intrin.h
@@ -295,6 +301,7 @@ set(files
   ${systemz_files}
   ${ve_files}
   ${x86_files}
+  ${gpu_files}
   ${webassembly_files}
   ${windows_only_files}
   ${utility_files}
@@ -517,6 +524,7 @@ add_header_target("systemz-resource-headers" 
"${systemz_files};${zos_wrapper_fil
 add_header_target("ve-resource-headers" "${ve_files}")
 add_header_target("webassembly-resource-headers" "${webassembly_files}")
 add_header_target("x86-resource-headers" "${x86_files}")
+add_header_target("gpu-resource-headers" "${gpu_files}")
 
 # Other header groupings
 add_header_target("hlsl-resource-headers" ${hlsl_files})
@@ -703,6 +711,12 @@ install(
   EXCLUDE_FROM_ALL
   COMPONENT x86-resource-headers)
 
+install(
+  FILES ${gpu_files}
+  DESTINATION ${header_install_dir}
+  EXCLUDE_FROM_ALL
+  COMPONENT gpu-resource-headers)
+
 if(NOT CLANG_ENABLE_HLSL)
   set(EXCLUDE_HLSL EXCLUDE_FROM_ALL)
 endif()
diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
new file mode 100644
index 00..55f4148d5b9d32
--- /dev/null
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -0,0 +1,193 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device)) __attribute__((always_inline))
+#else
+#define _DEFAULT_ATTRS __attribute__((always_inline))
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define _private __attribute__((opencl_private))
+#define _constant __attribute__((opencl_constant))
+#define _local __attribute__((opencl_local))
+#define _global __attribute__((opencl_global))
+
+// Some helpers for OpenCL-like vectors types.
+#define _vector1 __attribute__((ext_vector_type(1)))
+#define _vector2 __attribute__((ext_vector_type(2)))
+#define _vector3 __attribute__((ext_vector_type(3)))
+#define _vector4 __attribute__((ext_vector_type(4)))
+
+// Attribute to declare a function as a kernel.
+#define _kernel __attribute__((amdgpu_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_x() {
+  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFA

[clang] [nvlink-wrapper] Use a symbolic link instead of copying the file (PR #110139)

2024-09-26 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 updated 
https://github.com/llvm/llvm-project/pull/110139

>From 393e05145d0c31a3b1b254f97a357c776617898c Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Thu, 26 Sep 2024 11:04:46 -0500
Subject: [PATCH] [nvlink-wrapper] Use a symbolic link instead of copying the
 file

Summary:
We need all inputs to `nvlink` to end in `.cubin` while the rest of the
compiler toolchain wants `.o`. Previously we copied `.o` file to
`.cubin` files, but this is wasteful. Instead, we can just create a link
against it. This saves some disk space during link time.
---
 .../ClangNVLinkWrapper.cpp| 25 ---
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp 
b/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
index 871fe5e4553ccb..bdc1cc40496370 100644
--- a/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
+++ b/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
@@ -655,22 +655,19 @@ Expected> getInput(const ArgList 
&Args) {
 }
   }
 
-  // Copy all of the input files to a new file ending in `.cubin`. The 'nvlink'
+  // Create a link for each file to a new file ending in `.cubin`. The 'nvlink'
   // linker requires all NVPTX inputs to have this extension for some reason.
   for (auto &Input : LinkerInput) {
-auto TempFileOrErr = createTempFile(
-Args, sys::path::stem(Input->getBufferIdentifier()), "cubin");
-if (!TempFileOrErr)
-  return TempFileOrErr.takeError();
-Expected> OutputOrErr =
-FileOutputBuffer::create(*TempFileOrErr, Input->getBuffer().size());
-if (!OutputOrErr)
-  return OutputOrErr.takeError();
-std::unique_ptr Output = std::move(*OutputOrErr);
-llvm::copy(Input->getBuffer(), Output->getBufferStart());
-if (Error E = Output->commit())
-  return E;
-Files.emplace_back(Args.MakeArgString(*TempFileOrErr));
+SmallString<128> TempFile;
+if (std::error_code EC = sys::fs::getPotentiallyUniqueTempFileName(
+sys::path::stem(Input->getBufferIdentifier()), "cubin", TempFile))
+  reportError(createFileError(TempFile, EC));
+if (std::error_code EC =
+sys::fs::create_link(Input->getBufferIdentifier(), TempFile)) {
+  reportError(createFileError(TempFile, EC));
+}
+Files.emplace_back(Args.MakeArgString(TempFile));
+TempFiles.emplace_back(std::move(TempFile));
   }
 
   return Files;

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [nvlink-wrapper] Use a symbolic link instead of copying the file (PR #110139)

2024-09-26 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 updated 
https://github.com/llvm/llvm-project/pull/110139

>From 393e05145d0c31a3b1b254f97a357c776617898c Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Thu, 26 Sep 2024 11:04:46 -0500
Subject: [PATCH 1/2] [nvlink-wrapper] Use a symbolic link instead of copying
 the file

Summary:
We need all inputs to `nvlink` to end in `.cubin` while the rest of the
compiler toolchain wants `.o`. Previously we copied `.o` file to
`.cubin` files, but this is wasteful. Instead, we can just create a link
against it. This saves some disk space during link time.
---
 .../ClangNVLinkWrapper.cpp| 25 ---
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp 
b/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
index 871fe5e4553ccb..bdc1cc40496370 100644
--- a/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
+++ b/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
@@ -655,22 +655,19 @@ Expected> getInput(const ArgList 
&Args) {
 }
   }
 
-  // Copy all of the input files to a new file ending in `.cubin`. The 'nvlink'
+  // Create a link for each file to a new file ending in `.cubin`. The 'nvlink'
   // linker requires all NVPTX inputs to have this extension for some reason.
   for (auto &Input : LinkerInput) {
-auto TempFileOrErr = createTempFile(
-Args, sys::path::stem(Input->getBufferIdentifier()), "cubin");
-if (!TempFileOrErr)
-  return TempFileOrErr.takeError();
-Expected> OutputOrErr =
-FileOutputBuffer::create(*TempFileOrErr, Input->getBuffer().size());
-if (!OutputOrErr)
-  return OutputOrErr.takeError();
-std::unique_ptr Output = std::move(*OutputOrErr);
-llvm::copy(Input->getBuffer(), Output->getBufferStart());
-if (Error E = Output->commit())
-  return E;
-Files.emplace_back(Args.MakeArgString(*TempFileOrErr));
+SmallString<128> TempFile;
+if (std::error_code EC = sys::fs::getPotentiallyUniqueTempFileName(
+sys::path::stem(Input->getBufferIdentifier()), "cubin", TempFile))
+  reportError(createFileError(TempFile, EC));
+if (std::error_code EC =
+sys::fs::create_link(Input->getBufferIdentifier(), TempFile)) {
+  reportError(createFileError(TempFile, EC));
+}
+Files.emplace_back(Args.MakeArgString(TempFile));
+TempFiles.emplace_back(std::move(TempFile));
   }
 
   return Files;

>From 315a9845c220277ce70d61556b13703d50a0b1aa Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Thu, 26 Sep 2024 14:44:34 -0500
Subject: [PATCH 2/2] Fix Windows

---
 .../clang-nvlink-wrapper/ClangNVLinkWrapper.cpp | 17 +
 1 file changed, 17 insertions(+)

diff --git a/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp 
b/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
index bdc1cc40496370..8ec1f722fa8a10 100644
--- a/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
+++ b/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
@@ -657,7 +657,23 @@ Expected> getInput(const ArgList 
&Args) {
 
   // Create a link for each file to a new file ending in `.cubin`. The 'nvlink'
   // linker requires all NVPTX inputs to have this extension for some reason.
+  // Windows cannot create symbolic links so we just copy the whole file.
   for (auto &Input : LinkerInput) {
+#ifdef _WIN32
+auto TempFileOrErr = createTempFile(
+Args, sys::path::stem(Input->getBufferIdentifier()), "cubin");
+if (!TempFileOrErr)
+  return TempFileOrErr.takeError();
+Expected> OutputOrErr =
+FileOutputBuffer::create(*TempFileOrErr, Input->getBuffer().size());
+if (!OutputOrErr)
+  return OutputOrErr.takeError();
+std::unique_ptr Output = std::move(*OutputOrErr);
+llvm::copy(Input->getBuffer(), Output->getBufferStart());
+if (Error E = Output->commit())
+  return E;
+Files.emplace_back(Args.MakeArgString(*TempFileOrErr));
+#else
 SmallString<128> TempFile;
 if (std::error_code EC = sys::fs::getPotentiallyUniqueTempFileName(
 sys::path::stem(Input->getBufferIdentifier()), "cubin", TempFile))
@@ -668,6 +684,7 @@ Expected> getInput(const ArgList 
&Args) {
 }
 Files.emplace_back(Args.MakeArgString(TempFile));
 TempFiles.emplace_back(std::move(TempFile));
+#endif
   }
 
   return Files;

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [nvlink-wrapper] Use a symbolic link instead of copying the file (PR #110139)

2024-09-26 Thread Joseph Huber via cfe-commits

jhuber6 wrote:

Build bot says no, apparently. "The system cannot move the file to a different 
disk drive."

https://github.com/llvm/llvm-project/pull/110139
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [nvlink-wrapper] Use a symbolic link instead of copying the file (PR #110139)

2024-09-26 Thread Joseph Huber via cfe-commits

jhuber6 wrote:

> @rnk Are symlinks OK to use on windows?

There's a caveat in the implementation stating that the links are soft on Linux 
but hard on Windows (as soft links require super-user privileges). I'm pretty 
sure a hard link also does the job here? Since all we need to do is give the 
same data a different filename.

https://github.com/llvm/llvm-project/pull/110139
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [flang] [Flang][Driver][Offload] Support -Xoffload-linker argument in Flang (PR #109907)

2024-09-26 Thread Joseph Huber via cfe-commits


@@ -1,9 +1,10 @@
-! RUN %flang -### --target=x86_64-unknown-linux-gnu -fopenmp 
--offload-arch=gfx90a -Xoffload-linker a %s 2>&1 | FileCheck %s 
--check-prefixes=CHECK-XLINKER
+! Test the -Xoffload-linker flag that forwards link commands to the 
clang-linker-wrapper used
+! to help link offloading device libraries
 
-! CHECK-XLINKER {{.*}}--device-linker=a{{.*}}
+! RUN: %flang -### --target=x86_64-unknown-linux-gnu -fopenmp 
--offload-arch=gfx90a -Xoffload-linker a %s 2>&1 | FileCheck %s 
--check-prefixes=CHECK-XLINKER
 
-! RUN: %flang -### --target=x86_64-unknown-linux-gnu -fopenmp 
--offload-arch=gfx90a -Xoffload-linker a -Xoffload-linker-amdgcn-amd-amdhsa b 
%s 2>&1 | FileCheck %s --check-prefixes=CHECK-XLINKER-AMDGCN
+! CHECK-XLINKER: -device-linker=a{{.*}}-
 
-! CHECK-XLINKER-AMDGCN: 
{{.*}}"--device-linker=a"{{.*}}"--device-linker=amdgcn-amd-amdhsa=b"{{.*}}
+! RUN: %flang -### --target=x86_64-unknown-linux-gnu -fopenmp 
--offload-arch=gfx90a -Xoffload-linker a -Xoffload-linker-amdgcn-amd-amdhsa b 
%s 2>&1 | FileCheck %s --check-prefixes=CHECK-XLINKER-AMDGCN
 
-end program
+! CHECK-XLINKER-AMDGCN: 
-device-linker=a{{.*}}-device-linker=amdgcn-amd-amdhsa=b{{.*}}--

jhuber6 wrote:

Why is there a `--` here

https://github.com/llvm/llvm-project/pull/109907
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [nvlink-wrapper] Use a symbolic link instead of copying the file (PR #110139)

2024-09-26 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 created 
https://github.com/llvm/llvm-project/pull/110139

Summary:
We need all inputs to `nvlink` to end in `.cubin` while the rest of the
compiler toolchain wants `.o`. Previously we copied `.o` file to
`.cubin` files, but this is wasteful. Instead, we can just create a link
against it. This saves some disk space during link time.


>From 8c1cd42f2b860f5b9640f98e05e242f2003d35ba Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Thu, 26 Sep 2024 11:04:46 -0500
Subject: [PATCH] [nvlink-wrapper] Use a symbolic link instead of copying the
 file

Summary:
We need all inputs to `nvlink` to end in `.cubin` while the rest of the
compiler toolchain wants `.o`. Previously we copied `.o` file to
`.cubin` files, but this is wasteful. Instead, we can just create a link
against it. This saves some disk space during link time.
---
 .../clang-nvlink-wrapper/ClangNVLinkWrapper.cpp | 13 -
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp 
b/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
index 871fe5e4553ccb..19faef8dc07e45 100644
--- a/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
+++ b/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
@@ -655,21 +655,16 @@ Expected> getInput(const ArgList 
&Args) {
 }
   }
 
-  // Copy all of the input files to a new file ending in `.cubin`. The 'nvlink'
+  // Create a link for each file to a new file ending in `.cubin`. The 'nvlink'
   // linker requires all NVPTX inputs to have this extension for some reason.
   for (auto &Input : LinkerInput) {
 auto TempFileOrErr = createTempFile(
 Args, sys::path::stem(Input->getBufferIdentifier()), "cubin");
 if (!TempFileOrErr)
   return TempFileOrErr.takeError();
-Expected> OutputOrErr =
-FileOutputBuffer::create(*TempFileOrErr, Input->getBuffer().size());
-if (!OutputOrErr)
-  return OutputOrErr.takeError();
-std::unique_ptr Output = std::move(*OutputOrErr);
-llvm::copy(Input->getBuffer(), Output->getBufferStart());
-if (Error E = Output->commit())
-  return E;
+if (std::error_code EC =
+sys::fs::create_link(Input->getBufferIdentifier(), *TempFileOrErr))
+  reportError(createFileError(*TempFileOrErr, EC));
 Files.emplace_back(Args.MakeArgString(*TempFileOrErr));
   }
 

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [flang] [Flang][Driver][Offload] Support -Xoffload-linker argument in Flang (PR #109907)

2024-09-26 Thread Joseph Huber via cfe-commits


@@ -1,9 +1,10 @@
-! RUN %flang -### --target=x86_64-unknown-linux-gnu -fopenmp 
--offload-arch=gfx90a -Xoffload-linker a %s 2>&1 | FileCheck %s 
--check-prefixes=CHECK-XLINKER
+! Test the -Xoffload-linker flag that forwards link commands to the 
clang-linker-wrapper used
+! to help link offloading device libraries
 
-! CHECK-XLINKER {{.*}}--device-linker=a{{.*}}
+! RUN: %flang -### --target=x86_64-unknown-linux-gnu -fopenmp 
--offload-arch=gfx90a -Xoffload-linker a %s 2>&1 | FileCheck %s 
--check-prefixes=CHECK-XLINKER
 
-! RUN: %flang -### --target=x86_64-unknown-linux-gnu -fopenmp 
--offload-arch=gfx90a -Xoffload-linker a -Xoffload-linker-amdgcn-amd-amdhsa b 
%s 2>&1 | FileCheck %s --check-prefixes=CHECK-XLINKER-AMDGCN
+! CHECK-XLINKER: -device-linker=a{{.*}}-

jhuber6 wrote:

and a `-` here?

https://github.com/llvm/llvm-project/pull/109907
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang] Automatically link the `compiler-rt` for GPUs if present (PR #109152)

2024-09-25 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 closed 
https://github.com/llvm/llvm-project/pull/109152
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [flang] [Flang][Driver][Offload] Support -Xoffload-linker argument in Flang (PR #109907)

2024-09-25 Thread Joseph Huber via cfe-commits

jhuber6 wrote:

Maybe it's the double dashes after the check? I guess while we're at it might 
as well check the `-Xoffload-linker-amdgcn-amd-amdhsa` format as well.

https://github.com/llvm/llvm-project/pull/109907
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [flang] [Flang][Driver][Offload] Support -Xoffload-linker argument in Flang (PR #109907)

2024-09-25 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 approved this pull request.


https://github.com/llvm/llvm-project/pull/109907
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Convert AMDGPUResourceUsageAnalysis pass from Module to MF pass (PR #102913)

2024-09-23 Thread Joseph Huber via cfe-commits

jhuber6 wrote:

If we already have per-function metadata, I'm wondering how difficult it would 
be to put this handling in the linker. AFAIK there's already handling for 
`call-graph-profile` which can inform the linker of the call-graph, so we could 
potentially just walk that graph, find the diameter of the register usage and 
then emit it in the final HSA metadata. There would still be the issue of LDS 
usage, but we could probably just state that LDS used by a kernel outside the 
current TU doesn't work for starters.

https://github.com/llvm/llvm-project/pull/102913
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [Coroutines] Change `llvm.coro.noop` to accept `llvm_anyptr_ty` instead (PR #102096)

2024-09-23 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 reopened 
https://github.com/llvm/llvm-project/pull/102096
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [Coroutines] Change `llvm.coro.noop` to accept `llvm_anyptr_ty` instead (PR #102096)

2024-09-23 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 closed 
https://github.com/llvm/llvm-project/pull/102096
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] 5b9206d - [Driver] Fix nvlink wrapper test

2024-09-22 Thread Joseph Huber via cfe-commits

Author: Joseph Huber
Date: 2024-09-22T08:16:01-05:00
New Revision: 5b9206dbe42a149f44cc267508d439717912cb1d

URL: 
https://github.com/llvm/llvm-project/commit/5b9206dbe42a149f44cc267508d439717912cb1d
DIFF: 
https://github.com/llvm/llvm-project/commit/5b9206dbe42a149f44cc267508d439717912cb1d.diff

LOG: [Driver] Fix nvlink wrapper test

Added: 


Modified: 
clang/test/Driver/nvlink-wrapper.c

Removed: 




diff  --git a/clang/test/Driver/nvlink-wrapper.c 
b/clang/test/Driver/nvlink-wrapper.c
index 2ef09b699eccb8..2b0993caee4248 100644
--- a/clang/test/Driver/nvlink-wrapper.c
+++ b/clang/test/Driver/nvlink-wrapper.c
@@ -82,6 +82,6 @@ int baz() { return y + x; }
 //
 // Check that '-plugin` is ingored like in `ld.lld`
 //
-// RUN: clang-nvlink-wrapper --dry-run %t.o -plugin -arch sm_52 -o a.out \
+// RUN: clang-nvlink-wrapper --dry-run %t.o -plugin foo.so -arch sm_52 -o 
a.out \
 // RUN:   2>&1 | FileCheck %s --check-prefix=PLUGIN
 // PLUGIN-NOT: -plugin



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] 68e2b69 - [NvlinkWrapper] Fix `-pluing` not consuming its argument

2024-09-22 Thread Joseph Huber via cfe-commits

Author: Joseph Huber
Date: 2024-09-22T08:04:20-05:00
New Revision: 68e2b695eae06b42261ecdc145c1f1ece57cd14c

URL: 
https://github.com/llvm/llvm-project/commit/68e2b695eae06b42261ecdc145c1f1ece57cd14c
DIFF: 
https://github.com/llvm/llvm-project/commit/68e2b695eae06b42261ecdc145c1f1ece57cd14c.diff

LOG: [NvlinkWrapper] Fix `-pluing` not consuming its argument

Summary:
Sometimes `clang` will pass `-plugin` when doing LTO, which should be
correctly consumed by the nvlink wrapper. Right now it was leaving the
`plugin.so` argument as a regular input, which would cause it to error
on the `.so` input.

Added: 


Modified: 
clang/tools/clang-nvlink-wrapper/NVLinkOpts.td

Removed: 




diff  --git a/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td 
b/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td
index ef1a7542e49502..eeb9d1a6228240 100644
--- a/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td
+++ b/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td
@@ -39,7 +39,7 @@ def library_S : Separate<["--", "-"], "library">, 
Flags<[HelpHidden]>,
 def library_EQ : Joined<["--", "-"], "library=">, Flags<[HelpHidden]>,
   Alias;
 
-def plugin : Joined<["--", "-"], "plugin">, 
+def plugin : JoinedOrSeparate<["--", "-"], "plugin">,
   Flags<[HelpHidden, WrapperOnlyOption]>;
 
 def arch : Separate<["--", "-"], "arch">,



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AMDGPU] Correctly use the auxiliary toolchain to include libc++ (PR #109366)

2024-09-20 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 closed 
https://github.com/llvm/llvm-project/pull/109366
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AMDGPU] Correctly use the auxiliary toolchain to include libc++ (PR #109366)

2024-09-20 Thread Joseph Huber via cfe-commits

jhuber6 wrote:

> The fix looks good. A test would be preferred.

Done

https://github.com/llvm/llvm-project/pull/109366
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AMDGPU] Correctly use the auxiliary toolchain to include libc++ (PR #109366)

2024-09-20 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 updated 
https://github.com/llvm/llvm-project/pull/109366

>From f47b67c20014fbedc5ce9764be2e2687258a474e Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Thu, 19 Sep 2024 22:03:42 -0500
Subject: [PATCH] [AMDGPU] Correctly use the auxiliary toolchain to include
 libc++

Summary:
Now that we have a functional build for `libc++` on the GPU, it will now
find the target specific headers in `include/amdgcn-amd-amdhsa`. This is
a problem for offloading via OpenMP because we need the CPU and GPU
headers to match exactly. All the other toolchains forward this
correctly except the AMDGPU OpenMP one, fix this by overriding it to use
the host toolchain instead of the device one, so the triple is not
returned as `amdgcn-amd-amdhsa`.
---
 clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp | 5 +
 clang/lib/Driver/ToolChains/AMDGPUOpenMP.h   | 3 +++
 clang/test/Driver/amdgpu-openmp-toolchain.c  | 4 
 3 files changed, 12 insertions(+)

diff --git a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp 
b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
index d43e683e46852d..3f0b3f2d86b3ed 100644
--- a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
@@ -120,6 +120,11 @@ AMDGPUOpenMPToolChain::GetCXXStdlibType(const ArgList 
&Args) const {
   return HostTC.GetCXXStdlibType(Args);
 }
 
+void AMDGPUOpenMPToolChain::AddClangCXXStdlibIncludeArgs(
+const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CC1Args) const {
+  HostTC.AddClangCXXStdlibIncludeArgs(Args, CC1Args);
+}
+
 void AMDGPUOpenMPToolChain::AddClangSystemIncludeArgs(
 const ArgList &DriverArgs, ArgStringList &CC1Args) const {
   HostTC.AddClangSystemIncludeArgs(DriverArgs, CC1Args);
diff --git a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.h 
b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.h
index 2be444a42c55fa..0536c9f7f564c8 100644
--- a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.h
+++ b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.h
@@ -42,6 +42,9 @@ class LLVM_LIBRARY_VISIBILITY AMDGPUOpenMPToolChain final
 Action::OffloadKind DeviceOffloadKind) const override;
   void addClangWarningOptions(llvm::opt::ArgStringList &CC1Args) const 
override;
   CXXStdlibType GetCXXStdlibType(const llvm::opt::ArgList &Args) const 
override;
+  void AddClangCXXStdlibIncludeArgs(
+  const llvm::opt::ArgList &Args,
+  llvm::opt::ArgStringList &CC1Args) const override;
   void
   AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs,
 llvm::opt::ArgStringList &CC1Args) const override;
diff --git a/clang/test/Driver/amdgpu-openmp-toolchain.c 
b/clang/test/Driver/amdgpu-openmp-toolchain.c
index a153c4afb0ce8c..184819b790c4ff 100644
--- a/clang/test/Driver/amdgpu-openmp-toolchain.c
+++ b/clang/test/Driver/amdgpu-openmp-toolchain.c
@@ -77,3 +77,7 @@
 // RUN:   %clang -### --target=x86_64-unknown-linux-gnu -emit-llvm -S -fopenmp 
-fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa 
-march=gfx803 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-WARN-ATOMIC
 // CHECK-WARN-ATOMIC-NOT: "-cc1" "-triple" 
"x86_64-unknown-linux-gnu"{{.*}}"-Werror=atomic-alignment"
 // CHECK-WARN-ATOMIC: "-cc1" "-triple" 
"amdgcn-amd-amdhsa"{{.*}}"-Werror=atomic-alignment"
+
+// RUN:   %clang -### --target=x86_64-unknown-linux-gnu -emit-llvm -S -fopenmp 
--offload-arch=gfx803 \
+// RUN: -stdlib=libc++ -nogpulib %s 2>&1 | FileCheck %s 
--check-prefix=LIBCXX
+// LIBCXX-NOT: include/amdgcn-amd-amdhsa/c++/v1

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AMDGPU] Correctly use the auxiliary toolchain to include libc++ (PR #109366)

2024-09-19 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 created 
https://github.com/llvm/llvm-project/pull/109366

Summary:
Now that we have a functional build for `libc++` on the GPU, it will now
find the target specific headers in `include/amdgcn-amd-amdhsa`. This is
a problem for offloading via OpenMP because we need the CPU and GPU
headers to match exactly. All the other toolchains forward this
correctly except the AMDGPU OpenMP one, fix this by overriding it to use
the host toolchain instead of the device one, so the triple is not
returned as `amdgcn-amd-amdhsa`.


>From 40be52a90d3f0d4fba61112343cbc54bbfb327b7 Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Thu, 19 Sep 2024 22:03:42 -0500
Subject: [PATCH] [AMDGPU] Correctly use the auxiliary toolchain to include
 libc++

Summary:
Now that we have a functional build for `libc++` on the GPU, it will now
find the target specific headers in `include/amdgcn-amd-amdhsa`. This is
a problem for offloading via OpenMP because we need the CPU and GPU
headers to match exactly. All the other toolchains forward this
correctly except the AMDGPU OpenMP one, fix this by overriding it to use
the host toolchain instead of the device one, so the triple is not
returned as `amdgcn-amd-amdhsa`.
---
 clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp | 5 +
 clang/lib/Driver/ToolChains/AMDGPUOpenMP.h   | 3 +++
 2 files changed, 8 insertions(+)

diff --git a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp 
b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
index d43e683e46852d..3f0b3f2d86b3ed 100644
--- a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
@@ -120,6 +120,11 @@ AMDGPUOpenMPToolChain::GetCXXStdlibType(const ArgList 
&Args) const {
   return HostTC.GetCXXStdlibType(Args);
 }
 
+void AMDGPUOpenMPToolChain::AddClangCXXStdlibIncludeArgs(
+const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CC1Args) const {
+  HostTC.AddClangCXXStdlibIncludeArgs(Args, CC1Args);
+}
+
 void AMDGPUOpenMPToolChain::AddClangSystemIncludeArgs(
 const ArgList &DriverArgs, ArgStringList &CC1Args) const {
   HostTC.AddClangSystemIncludeArgs(DriverArgs, CC1Args);
diff --git a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.h 
b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.h
index 2be444a42c55fa..0536c9f7f564c8 100644
--- a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.h
+++ b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.h
@@ -42,6 +42,9 @@ class LLVM_LIBRARY_VISIBILITY AMDGPUOpenMPToolChain final
 Action::OffloadKind DeviceOffloadKind) const override;
   void addClangWarningOptions(llvm::opt::ArgStringList &CC1Args) const 
override;
   CXXStdlibType GetCXXStdlibType(const llvm::opt::ArgList &Args) const 
override;
+  void AddClangCXXStdlibIncludeArgs(
+  const llvm::opt::ArgList &Args,
+  llvm::opt::ArgStringList &CC1Args) const override;
   void
   AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs,
 llvm::opt::ArgStringList &CC1Args) const override;

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang] Automatically link the `compiler-rt` for GPUs if present (PR #109152)

2024-09-18 Thread Joseph Huber via cfe-commits

jhuber6 wrote:

> I like this direction and I think it should be the right way. However, IMHO, 
> I think it needs discussion (and potentially an RFC).

Moving from the header to the definition in `compiler-rt` would warrant an RFC, 
this patch just automatically links something that exists and will only provide 
definitions if they are undefined. So the only effect this has right now is 
linking in a symbol that would otherwise be a linker error.

https://github.com/llvm/llvm-project/pull/109152
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang] Automatically link the `compiler-rt` for GPUs if present (PR #109152)

2024-09-18 Thread Joseph Huber via cfe-commits

jhuber6 wrote:

> I'm not sure about this. What does `compiler-rt` provide?

It's `clang_rt.builtins`. So, basically complex number multiplication / 
division, wide integer stuff (I think the backend handles i128 now though). We 
currently have a wrapper header that defined `__mulsc3` for the device even 
though it's supposed to come from the compiler-rt.

https://github.com/llvm/llvm-project/pull/109152
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang] Automatically link the `compiler-rt` for GPUs if present (PR #109152)

2024-09-18 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 created 
https://github.com/llvm/llvm-project/pull/109152

Summary:
This automatically links `compiler-rt` for offloading languages if it
exists in the resource directory.


>From b6f6cbf7e1819779eeece437daef5bfb9b2a8cd0 Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Wed, 18 Sep 2024 09:51:51 -0500
Subject: [PATCH] [Clang] Automatically link the `compiler-rt` for GPUs if
 present

Summary:
This automatically links `compiler-rt` for offloading languages if it
exists in the resource directory.
---
 clang/lib/Driver/ToolChains/Clang.cpp | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/clang/lib/Driver/ToolChains/Clang.cpp 
b/clang/lib/Driver/ToolChains/Clang.cpp
index c00df5f5bc729c..1e615214580134 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -9239,6 +9239,12 @@ void LinkerWrapper::ConstructJob(Compilation &C, const 
JobAction &JA,
 CmdArgs.push_back(Args.MakeArgString(
 "--device-linker=" + TC.getTripleString() + "=" + "-lm"));
   }
+  auto HasCompilerRT = getToolChain().getVFS().exists(
+  TC.getCompilerRT(Args, "builtins", ToolChain::FT_Static));
+  if (HasCompilerRT)
+CmdArgs.push_back(
+Args.MakeArgString("--device-linker=" + TC.getTripleString() + "=" 
+
+   "-lclang_rt.builtins"));
 });
   }
 

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [libcxx] [Clang] Do not implicitly link C libraries for the GPU targets (PR #109052)

2024-09-17 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 created 
https://github.com/llvm/llvm-project/pull/109052

Summary:
I initially thought that it would be convenient to automatically link
these libraries like they are for standard C/C++ targets. However, this
created issues when trying to use C++ as a GPU target. This patch moves
the logic to now implicitly pass it as part of the offloading toolchain
instead, if found. This means that the user needs to set the target
toolchain for the link job for automatic detection, but can still be
done manually via `-Xoffload-linker -lc`.


>From 657eed1af8f9d0dfce8727d090340f98e8abaaa7 Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Tue, 17 Sep 2024 16:10:25 -0500
Subject: [PATCH] [Clang] Do not implicitly link C libraries for the GPU
 targets

Summary:
I initially thought that it would be convenient to automatically link
these libraries like they are for standard C/C++ targets. However, this
created issues when trying to use C++ as a GPU target. This patch moves
the logic to now implicitly pass it as part of the offloading toolchain
instead, if found. This means that the user needs to set the target
toolchain for the link job for automatic detection, but can still be
done manually via `-Xoffload-linker -lc`.
---
 clang/lib/Driver/ToolChains/AMDGPU.cpp|  4 +---
 clang/lib/Driver/ToolChains/Clang.cpp | 19 +++
 clang/lib/Driver/ToolChains/CommonArgs.cpp| 16 
 clang/lib/Driver/ToolChains/CommonArgs.h  |  3 ---
 clang/lib/Driver/ToolChains/Cuda.cpp  |  2 --
 clang/test/Driver/openmp-offload-gpu.c|  2 +-
 .../modules/prepare_libc_gpu_build.cmake  |  4 ++--
 libcxx/cmake/caches/AMDGPU.cmake  |  2 +-
 libcxx/cmake/caches/NVPTX.cmake   |  2 +-
 9 files changed, 25 insertions(+), 29 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp 
b/clang/lib/Driver/ToolChains/AMDGPU.cpp
index 74f70573c5feb8..2c85d21ebd738c 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -648,8 +648,6 @@ void amdgpu::Linker::ConstructJob(Compilation &C, const 
JobAction &JA,
 Args.MakeArgString("-plugin-opt=-mattr=" + llvm::join(Features, ",")));
   }
 
-  addGPULibraries(getToolChain(), Args, CmdArgs);
-
   CmdArgs.push_back("-o");
   CmdArgs.push_back(Output.getFilename());
   C.addCommand(std::make_unique(
@@ -1089,4 +1087,4 @@ bool AMDGPUToolChain::shouldSkipSanitizeOption(
 return true;
   }
   return false;
-}
\ No newline at end of file
+}
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp 
b/clang/lib/Driver/ToolChains/Clang.cpp
index 3fe4ce5d893b8d..196330e74e8392 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -9223,6 +9223,25 @@ void LinkerWrapper::ConstructJob(Compilation &C, const 
JobAction &JA,
 A->claim();
   }
 
+  // Pass in the C library for GPUs if present and not disabled.
+  if (!Args.hasArg(options::OPT_nostdlib, options::OPT_r, 
options::OPT_nogpulib,
+   options::OPT_nodefaultlibs, options::OPT_nolibc,
+   options::OPT_nogpulibc)) {
+forAllAssociatedToolChains(C, JA, getToolChain(), [&](const ToolChain &TC) 
{
+  // The device C library is only available for NVPTX and AMDGPU targets
+  // currently.
+  if (!TC.getTriple().isNVPTX() && !TC.getTriple().isAMDGPU())
+return;
+  bool HasLibC = TC.getStdlibIncludePath().has_value();
+  if (HasLibC) {
+CmdArgs.push_back(Args.MakeArgString(
+"--device-linker=" + TC.getTripleString() + "=" + "-lc"));
+CmdArgs.push_back(Args.MakeArgString(
+"--device-linker=" + TC.getTripleString() + "=" + "-lm"));
+  }
+});
+  }
+
   // If we disable the GPU C library support it needs to be forwarded to the
   // link job.
   if (!Args.hasFlag(options::OPT_gpulibc, options::OPT_nogpulibc, true))
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp 
b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 502aba2ce4aa9c..043d9e48764439 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -510,22 +510,6 @@ void tools::addLinkerCompressDebugSectionsOption(
   }
 }
 
-void tools::addGPULibraries(const ToolChain &TC, const llvm::opt::ArgList 
&Args,
-llvm::opt::ArgStringList &CmdArgs) {
-  if (Args.hasArg(options::OPT_nostdlib, options::OPT_r,
-  options::OPT_nodefaultlibs, options::OPT_nolibc,
-  options::OPT_nogpulibc))
-return;
-
-  // If the user's toolchain has the 'include//` path, we assume it
-  // supports the standard C libraries for the GPU and include them.
-  bool HasLibC = TC.getStdlibIncludePath().has_value();
-  if (HasLibC) {
-CmdArgs.push_back("-lc");
-CmdArgs.push_back("-lm");
-  }
-}
-
 void tools::AddTargetFeature(const ArgList &Args,
  std::vector &Features,
  

[clang] 0f723eb - [Clang] Add locale variants to libc offload wrappers

2024-09-16 Thread Joseph Huber via cfe-commits

Author: Joseph Huber
Date: 2024-09-16T09:57:47-05:00
New Revision: 0f723eb67197421caf6504a7e4594751040b1924

URL: 
https://github.com/llvm/llvm-project/commit/0f723eb67197421caf6504a7e4594751040b1924
DIFF: 
https://github.com/llvm/llvm-project/commit/0f723eb67197421caf6504a7e4594751040b1924.diff

LOG: [Clang] Add locale variants to libc offload wrappers

Summary:
These need to be present now that the GPU "supports" them (only for the
default POSIX locale).

Added: 


Modified: 
clang/lib/Headers/llvm_libc_wrappers/ctype.h

Removed: 




diff  --git a/clang/lib/Headers/llvm_libc_wrappers/ctype.h 
b/clang/lib/Headers/llvm_libc_wrappers/ctype.h
index 49c2af93471b0e..960cf43302c4c9 100644
--- a/clang/lib/Headers/llvm_libc_wrappers/ctype.h
+++ b/clang/lib/Headers/llvm_libc_wrappers/ctype.h
@@ -51,6 +51,19 @@
 #pragma push_macro("toascii")
 #pragma push_macro("tolower")
 #pragma push_macro("toupper")
+#pragma push_macro("isalnum_l")
+#pragma push_macro("isalpha_l")
+#pragma push_macro("isascii_l")
+#pragma push_macro("isblank_l")
+#pragma push_macro("iscntrl_l")
+#pragma push_macro("isdigit_l")
+#pragma push_macro("isgraph_l")
+#pragma push_macro("islower_l")
+#pragma push_macro("isprint_l")
+#pragma push_macro("ispunct_l")
+#pragma push_macro("isspace_l")
+#pragma push_macro("isupper_l")
+#pragma push_macro("isxdigit_l")
 
 #undef isalnum
 #undef isalpha
@@ -68,6 +81,18 @@
 #undef toascii
 #undef tolower
 #undef toupper
+#undef isalnum_l
+#undef isalpha_l
+#undef iscntrl_l
+#undef isdigit_l
+#undef islower_l
+#undef isgraph_l
+#undef isprint_l
+#undef ispunct_l
+#undef isspace_l
+#undef isupper_l
+#undef isblank_l
+#undef isxdigit_l
 
 #pragma omp begin declare target
 
@@ -93,6 +118,19 @@
 #pragma pop_macro("toascii")
 #pragma pop_macro("tolower")
 #pragma pop_macro("toupper")
+#pragma pop_macro("isalnum_l")
+#pragma pop_macro("isalpha_l")
+#pragma pop_macro("isascii_l")
+#pragma pop_macro("isblank_l")
+#pragma pop_macro("iscntrl_l")
+#pragma pop_macro("isdigit_l")
+#pragma pop_macro("isgraph_l")
+#pragma pop_macro("islower_l")
+#pragma pop_macro("isprint_l")
+#pragma pop_macro("ispunct_l")
+#pragma pop_macro("isspace_l")
+#pragma pop_macro("isupper_l")
+#pragma pop_macro("isxdigit_l")
 #endif
 
 #undef __LIBC_ATTRS



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [Offload] Move HIP and CUDA to new driver by default (PR #84420)

2024-09-04 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 updated 
https://github.com/llvm/llvm-project/pull/84420

>From ac95806fa58d919a21f3724b10ba0634ff381d18 Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Thu, 7 Mar 2024 15:48:00 -0600
Subject: [PATCH] [Offload] Move HIP and CUDA to new driver by default

Summary:
This patch updates the `--offload-new-driver` flag to be default for all
current offloading languages. This mostly just required updating a lot
of tests to use the old format. I tried to update them where possible,
but some were directly checking the old format.

This is not intended to be landed immediately, but to allow for greater
testing. One potential issue I've discovered is the lack of SPIR-V
support or handling for `--offload`.
---
 clang/lib/Driver/Driver.cpp   |  8 +++---
 clang/lib/Driver/ToolChains/Clang.cpp | 19 ++
 clang/test/Driver/cl-offload.cu   |  5 ++--
 clang/test/Driver/cuda-arch-translation.cu| 26 +--
 clang/test/Driver/cuda-bindings.cu| 24 -
 clang/test/Driver/cuda-options.cu | 23 
 clang/test/Driver/cuda-output-asm.cu  |  4 ---
 clang/test/Driver/hip-gz-options.hip  |  1 -
 clang/test/Driver/hip-invalid-target-id.hip   |  4 +--
 clang/test/Driver/hip-macros.hip  |  3 ---
 clang/test/Driver/hip-offload-arch.hip|  4 +--
 clang/test/Driver/hip-options.hip |  6 +
 clang/test/Driver/hip-sanitize-options.hip|  2 +-
 clang/test/Driver/hip-save-temps.hip  | 12 -
 .../test/Driver/hip-toolchain-device-only.hip |  4 ---
 clang/test/Driver/hip-toolchain-mllvm.hip |  2 --
 clang/test/Driver/invalid-offload-options.cpp |  2 +-
 .../ClangLinkerWrapper.cpp|  9 +--
 clang/unittests/Tooling/ToolingTest.cpp   |  6 ++---
 llvm/lib/Object/OffloadBinary.cpp | 13 +++---
 20 files changed, 82 insertions(+), 95 deletions(-)

diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 5b3783e20eabba..e39baebe42bebb 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -4155,11 +4155,9 @@ void Driver::BuildActions(Compilation &C, DerivedArgList 
&Args,
   handleArguments(C, Args, Inputs, Actions);
 
   bool UseNewOffloadingDriver =
-  C.isOffloadingHostKind(Action::OFK_OpenMP) ||
-  Args.hasFlag(options::OPT_foffload_via_llvm,
-   options::OPT_fno_offload_via_llvm, false) ||
+  C.getActiveOffloadKinds() != Action::OFK_None &&
   Args.hasFlag(options::OPT_offload_new_driver,
-   options::OPT_no_offload_new_driver, false);
+   options::OPT_no_offload_new_driver, true);
 
   // Builder to be used to build offloading actions.
   std::unique_ptr OffloadBuilder =
@@ -4880,7 +4878,7 @@ Action *Driver::ConstructPhaseAction(
offloadDeviceOnly() ||
(TargetDeviceOffloadKind == Action::OFK_HIP &&
 !Args.hasFlag(options::OPT_offload_new_driver,
-  options::OPT_no_offload_new_driver, false)))
+  options::OPT_no_offload_new_driver, true)))
   ? types::TY_LLVM_IR
   : types::TY_LLVM_BC;
   return C.MakeAction(Input, Output);
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp 
b/clang/lib/Driver/ToolChains/Clang.cpp
index df86941950e46e..ba09ae56f18acc 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -4981,8 +4981,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction 
&JA,
   bool IsHostOffloadingAction =
   JA.isHostOffloading(Action::OFK_OpenMP) ||
   (JA.isHostOffloading(C.getActiveOffloadKinds()) &&
+   C.getActiveOffloadKinds() != Action::OFK_None &&
Args.hasFlag(options::OPT_offload_new_driver,
-options::OPT_no_offload_new_driver, false));
+options::OPT_no_offload_new_driver, true));
 
   bool IsRDCMode =
   Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false);
@@ -5309,7 +5310,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction 
&JA,
 if (IsUsingLTO) {
   if (IsDeviceOffloadAction && !JA.isDeviceOffloading(Action::OFK_OpenMP) 
&&
   !Args.hasFlag(options::OPT_offload_new_driver,
-options::OPT_no_offload_new_driver, false) &&
+options::OPT_no_offload_new_driver, true) &&
   !Triple.isAMDGPU()) {
 D.Diag(diag::err_drv_unsupported_opt_for_target)
 << Args.getLastArg(options::OPT_foffload_lto,
@@ -6774,16 +6775,12 @@ void Clang::ConstructJob(Compilation &C, const 
JobAction &JA,
 Args.addOptOutFlag(CmdArgs, options::OPT_fopenmp_extensions,
options::OPT_fno_openmp_extensions);
   }
-  // Forward the offload runtime change to code generation, liboffload implies
-  // new driver. Oth

[clang] [llvm] [Offload] Move HIP and CUDA to new driver by default (PR #84420)

2024-08-28 Thread Joseph Huber via cfe-commits

jhuber6 wrote:

@yxsamliu Do you know what the next steps for merging this would be? I'd like 
to get it into the Clang 20 release if possible. The only thing this loses 
currently is managed variables being registered in RDC mode, but I'm going to 
assume that's hardly seen in practice so I could probably punt that until 
later. I unfortunately haven't figured out a way to reproduce the build 
failures on rocBLAS that the fork saw. I think @saiislam was looking into that 
but couldn't get docker to work.

https://github.com/llvm/llvm-project/pull/84420
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [Offload] Move HIP and CUDA to new driver by default (PR #84420)

2024-08-28 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 updated 
https://github.com/llvm/llvm-project/pull/84420

>From 1d8acaceb01e937eb4f9ff0a205711f782cfe3da Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Thu, 7 Mar 2024 15:48:00 -0600
Subject: [PATCH] [Offload] Move HIP and CUDA to new driver by default

Summary:
This patch updates the `--offload-new-driver` flag to be default for all
current offloading languages. This mostly just required updating a lot
of tests to use the old format. I tried to update them where possible,
but some were directly checking the old format.

This is not intended to be landed immediately, but to allow for greater
testing. One potential issue I've discovered is the lack of SPIR-V
support or handling for `--offload`.
---
 clang/lib/Driver/Driver.cpp   |  8 +++---
 clang/lib/Driver/ToolChains/Clang.cpp | 19 ++
 clang/test/Driver/cl-offload.cu   |  5 ++--
 clang/test/Driver/cuda-arch-translation.cu| 26 +--
 clang/test/Driver/cuda-bindings.cu| 24 -
 clang/test/Driver/cuda-options.cu | 23 
 clang/test/Driver/cuda-output-asm.cu  |  4 ---
 clang/test/Driver/hip-gz-options.hip  |  1 -
 clang/test/Driver/hip-invalid-target-id.hip   |  4 +--
 clang/test/Driver/hip-macros.hip  |  3 ---
 clang/test/Driver/hip-offload-arch.hip|  4 +--
 clang/test/Driver/hip-options.hip |  6 +
 clang/test/Driver/hip-sanitize-options.hip|  2 +-
 clang/test/Driver/hip-save-temps.hip  | 12 -
 .../test/Driver/hip-toolchain-device-only.hip |  4 ---
 clang/test/Driver/hip-toolchain-mllvm.hip |  2 --
 clang/test/Driver/invalid-offload-options.cpp |  2 +-
 .../ClangLinkerWrapper.cpp|  9 +--
 clang/unittests/Tooling/ToolingTest.cpp   |  6 ++---
 llvm/lib/Object/OffloadBinary.cpp | 13 +++---
 20 files changed, 82 insertions(+), 95 deletions(-)

diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 43002add33774b..ea9ffc36f22b85 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -4154,11 +4154,9 @@ void Driver::BuildActions(Compilation &C, DerivedArgList 
&Args,
   handleArguments(C, Args, Inputs, Actions);
 
   bool UseNewOffloadingDriver =
-  C.isOffloadingHostKind(Action::OFK_OpenMP) ||
-  Args.hasFlag(options::OPT_foffload_via_llvm,
-   options::OPT_fno_offload_via_llvm, false) ||
+  C.getActiveOffloadKinds() != Action::OFK_None &&
   Args.hasFlag(options::OPT_offload_new_driver,
-   options::OPT_no_offload_new_driver, false);
+   options::OPT_no_offload_new_driver, true);
 
   // Builder to be used to build offloading actions.
   std::unique_ptr OffloadBuilder =
@@ -4879,7 +4877,7 @@ Action *Driver::ConstructPhaseAction(
offloadDeviceOnly() ||
(TargetDeviceOffloadKind == Action::OFK_HIP &&
 !Args.hasFlag(options::OPT_offload_new_driver,
-  options::OPT_no_offload_new_driver, false)))
+  options::OPT_no_offload_new_driver, true)))
   ? types::TY_LLVM_IR
   : types::TY_LLVM_BC;
   return C.MakeAction(Input, Output);
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp 
b/clang/lib/Driver/ToolChains/Clang.cpp
index df86941950e46e..ba09ae56f18acc 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -4981,8 +4981,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction 
&JA,
   bool IsHostOffloadingAction =
   JA.isHostOffloading(Action::OFK_OpenMP) ||
   (JA.isHostOffloading(C.getActiveOffloadKinds()) &&
+   C.getActiveOffloadKinds() != Action::OFK_None &&
Args.hasFlag(options::OPT_offload_new_driver,
-options::OPT_no_offload_new_driver, false));
+options::OPT_no_offload_new_driver, true));
 
   bool IsRDCMode =
   Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false);
@@ -5309,7 +5310,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction 
&JA,
 if (IsUsingLTO) {
   if (IsDeviceOffloadAction && !JA.isDeviceOffloading(Action::OFK_OpenMP) 
&&
   !Args.hasFlag(options::OPT_offload_new_driver,
-options::OPT_no_offload_new_driver, false) &&
+options::OPT_no_offload_new_driver, true) &&
   !Triple.isAMDGPU()) {
 D.Diag(diag::err_drv_unsupported_opt_for_target)
 << Args.getLastArg(options::OPT_foffload_lto,
@@ -6774,16 +6775,12 @@ void Clang::ConstructJob(Compilation &C, const 
JobAction &JA,
 Args.addOptOutFlag(CmdArgs, options::OPT_fopenmp_extensions,
options::OPT_fno_openmp_extensions);
   }
-  // Forward the offload runtime change to code generation, liboffload implies
-  // new driver. Oth

[clang] [AMDGPU] Use the AMDGPUToolChain when targeting C/C++ directly (PR #99687)

2024-08-27 Thread Joseph Huber via cfe-commits

jhuber6 wrote:

ping

https://github.com/llvm/llvm-project/pull/99687
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86][AVX10.2] Support AVX10.2-CONVERT new instructions. (PR #101600)

2024-08-21 Thread Joseph Huber via cfe-commits

jhuber6 wrote:

This is all it takes to reproduce this on my machine so I'm going to revert. 
x64 Linux, CPU is `znver3`.
```console
echo "#include " | ./bin/clang++ -x c++ -
```

https://github.com/llvm/llvm-project/pull/101600
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86][AVX10.2] Support AVX10.2-CONVERT new instructions. (PR #101600)

2024-08-21 Thread Joseph Huber via cfe-commits

jhuber6 wrote:

My guess is that you added these as a target dependent builtin, but made the 
headers always reference them.

https://github.com/llvm/llvm-project/pull/101600
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86][AVX10.2] Support AVX10.2-CONVERT new instructions. (PR #101600)

2024-08-21 Thread Joseph Huber via cfe-commits

jhuber6 wrote:

```llvm/llvm-project/build/lib/clang/20/include/avx10_2convertintrin.h:29:19: 
error: use of undeclared identifier '__builtin_ia32_vcvt2ps2phx128_mask'
   29 |   return (__m128h)__builtin_ia32_vcvt2ps2phx128_mask(
  |   ^
```
I'm getting a lot of errors like this since this patch landed.

https://github.com/llvm/llvm-project/pull/101600
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [OpenMP] Map `omp_default_mem_alloc` to global memory (PR #104790)

2024-08-20 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 closed 
https://github.com/llvm/llvm-project/pull/104790
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [OpenMP] Map `omp_default_mem_alloc` to global memory (PR #104790)

2024-08-19 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 updated 
https://github.com/llvm/llvm-project/pull/104790

>From eaa00ef74500833f280405c824d0282862c87b11 Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Mon, 19 Aug 2024 09:44:37 -0500
Subject: [PATCH 1/2] [OpenMP] Map `omp_default_mem_alloc` to global memory

Summary:
Currently, we assign this to private memory. This causes failures on
some SOLLVE tests. The standard isn't clear on the semantics of this
allocation type, but there seems to be a consensus that it's supposed to
be shared memory.
---
 clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 16 +---
 clang/test/OpenMP/nvptx_allocate_codegen.cpp | 10 --
 offload/test/api/omp_device_alloc.c  | 16 +++-
 3 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp 
b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 8965a14d88a6fb..77038b0f8ddc7b 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -2048,15 +2048,15 @@ Address 
CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
 const auto *A = VD->getAttr();
 auto AS = LangAS::Default;
 switch (A->getAllocatorType()) {
-  // Use the default allocator here as by default local vars are
-  // threadlocal.
 case OMPAllocateDeclAttr::OMPNullMemAlloc:
 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
-case OMPAllocateDeclAttr::OMPThreadMemAlloc:
 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
-  // Follow the user decision - use default allocation.
-  return Address::invalid();
+  AS = LangAS::opencl_global;
+  break;
+case OMPAllocateDeclAttr::OMPThreadMemAlloc:
+  AS = LangAS::opencl_private;
+  break;
 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
   // TODO: implement aupport for user-defined allocators.
   return Address::invalid();
@@ -2208,12 +2208,14 @@ bool 
CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
   case OMPAllocateDeclAttr::OMPNullMemAlloc:
   case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
   // Not supported, fallback to the default mem space.
-  case OMPAllocateDeclAttr::OMPThreadMemAlloc:
   case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
   case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
   case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
   case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
-AS = LangAS::Default;
+AS = LangAS::opencl_global;
+return true;
+  case OMPAllocateDeclAttr::OMPThreadMemAlloc:
+AS = LangAS::opencl_private;
 return true;
   case OMPAllocateDeclAttr::OMPConstMemAlloc:
 AS = LangAS::cuda_constant;
diff --git a/clang/test/OpenMP/nvptx_allocate_codegen.cpp 
b/clang/test/OpenMP/nvptx_allocate_codegen.cpp
index 3f3457dab33c2d..f4bd2458c3d17d 100644
--- a/clang/test/OpenMP/nvptx_allocate_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_allocate_codegen.cpp
@@ -87,10 +87,9 @@ void bar() {
 // CHECK1-SAME: () #[[ATTR0:[0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:[[RETVAL:%.*]] = alloca i32, align 4
-// CHECK1-NEXT:[[B:%.*]] = alloca double, align 8
 // CHECK1-NEXT:store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:store i32 2, ptr @_ZZ4mainE1a, align 4
-// CHECK1-NEXT:store double 3.00e+00, ptr [[B]], align 8
+// CHECK1-NEXT:store double 3.00e+00, ptr addrspacecast (ptr 
addrspace(1) @b1 to ptr), align 8
 // CHECK1-NEXT:[[CALL:%.*]] = call noundef i32 @_Z3fooIiET_v() 
#[[ATTR7:[0-9]+]]
 // CHECK1-NEXT:ret i32 [[CALL]]
 //
@@ -98,7 +97,7 @@ void bar() {
 // CHECK1-LABEL: define {{[^@]+}}@_Z3fooIiET_v
 // CHECK1-SAME: () #[[ATTR1:[0-9]+]] comdat {
 // CHECK1-NEXT:  entry:
-// CHECK1-NEXT:[[TMP0:%.*]] = load i32, ptr @_ZN2STIiE1mE, align 4
+// CHECK1-NEXT:[[TMP0:%.*]] = load i32, ptr addrspacecast (ptr 
addrspace(1) @_ZN2STIiE1mE to ptr), align 4
 // CHECK1-NEXT:store i32 [[TMP0]], ptr @v, align 4
 // CHECK1-NEXT:[[TMP1:%.*]] = load i32, ptr @v, align 4
 // CHECK1-NEXT:ret i32 [[TMP1]]
@@ -120,13 +119,12 @@ void bar() {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:[[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:[[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:[[BAR_A:%.*]] = alloca float, align 4
 // CHECK1-NEXT:store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], 
align 8
 // CHECK1-NEXT:store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], 
align 8
-// CHECK1-NEXT:[[TMP0:%.*]] = load float, ptr [[BAR_A]], align 4
+// CHECK1-NEXT:[[TMP0:%.*]] = load float, ptr @bar_a, align 4
 // CHECK1-NEXT:[[CONV:%.*]] = fpext float [[TMP0]] to double
 // CHECK1-NEXT:store double [[CONV]], ptr addrspacecast (ptr addrspace(3) 
@bar_b to ptr), align 8
-// CHECK1-NEXT:call void @_Z3bazRf(ptr noundef nonnull align 4 
dereferenceable(4) [[BAR_A]]) #[[ATTR7]]
+// CHECK1-NEXT:call void @_Z3bazRf(ptr noundef nonnull align

[clang] [llvm] [OpenMP] Map `omp_default_mem_alloc` to global memory (PR #104790)

2024-08-19 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 updated 
https://github.com/llvm/llvm-project/pull/104790

>From eaa00ef74500833f280405c824d0282862c87b11 Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Mon, 19 Aug 2024 09:44:37 -0500
Subject: [PATCH 1/2] [OpenMP] Map `omp_default_mem_alloc` to global memory

Summary:
Currently, we assign this to private memory. This causes failures on
some SOLLVE tests. The standard isn't clear on the semantics of this
allocation type, but there seems to be a consensus that it's supposed to
be shared memory.
---
 clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 16 +---
 clang/test/OpenMP/nvptx_allocate_codegen.cpp | 10 --
 offload/test/api/omp_device_alloc.c  | 16 +++-
 3 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp 
b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 8965a14d88a6fb..77038b0f8ddc7b 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -2048,15 +2048,15 @@ Address 
CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
 const auto *A = VD->getAttr();
 auto AS = LangAS::Default;
 switch (A->getAllocatorType()) {
-  // Use the default allocator here as by default local vars are
-  // threadlocal.
 case OMPAllocateDeclAttr::OMPNullMemAlloc:
 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
-case OMPAllocateDeclAttr::OMPThreadMemAlloc:
 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
-  // Follow the user decision - use default allocation.
-  return Address::invalid();
+  AS = LangAS::opencl_global;
+  break;
+case OMPAllocateDeclAttr::OMPThreadMemAlloc:
+  AS = LangAS::opencl_private;
+  break;
 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
   // TODO: implement aupport for user-defined allocators.
   return Address::invalid();
@@ -2208,12 +2208,14 @@ bool 
CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
   case OMPAllocateDeclAttr::OMPNullMemAlloc:
   case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
   // Not supported, fallback to the default mem space.
-  case OMPAllocateDeclAttr::OMPThreadMemAlloc:
   case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
   case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
   case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
   case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
-AS = LangAS::Default;
+AS = LangAS::opencl_global;
+return true;
+  case OMPAllocateDeclAttr::OMPThreadMemAlloc:
+AS = LangAS::opencl_private;
 return true;
   case OMPAllocateDeclAttr::OMPConstMemAlloc:
 AS = LangAS::cuda_constant;
diff --git a/clang/test/OpenMP/nvptx_allocate_codegen.cpp 
b/clang/test/OpenMP/nvptx_allocate_codegen.cpp
index 3f3457dab33c2d..f4bd2458c3d17d 100644
--- a/clang/test/OpenMP/nvptx_allocate_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_allocate_codegen.cpp
@@ -87,10 +87,9 @@ void bar() {
 // CHECK1-SAME: () #[[ATTR0:[0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:[[RETVAL:%.*]] = alloca i32, align 4
-// CHECK1-NEXT:[[B:%.*]] = alloca double, align 8
 // CHECK1-NEXT:store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:store i32 2, ptr @_ZZ4mainE1a, align 4
-// CHECK1-NEXT:store double 3.00e+00, ptr [[B]], align 8
+// CHECK1-NEXT:store double 3.00e+00, ptr addrspacecast (ptr 
addrspace(1) @b1 to ptr), align 8
 // CHECK1-NEXT:[[CALL:%.*]] = call noundef i32 @_Z3fooIiET_v() 
#[[ATTR7:[0-9]+]]
 // CHECK1-NEXT:ret i32 [[CALL]]
 //
@@ -98,7 +97,7 @@ void bar() {
 // CHECK1-LABEL: define {{[^@]+}}@_Z3fooIiET_v
 // CHECK1-SAME: () #[[ATTR1:[0-9]+]] comdat {
 // CHECK1-NEXT:  entry:
-// CHECK1-NEXT:[[TMP0:%.*]] = load i32, ptr @_ZN2STIiE1mE, align 4
+// CHECK1-NEXT:[[TMP0:%.*]] = load i32, ptr addrspacecast (ptr 
addrspace(1) @_ZN2STIiE1mE to ptr), align 4
 // CHECK1-NEXT:store i32 [[TMP0]], ptr @v, align 4
 // CHECK1-NEXT:[[TMP1:%.*]] = load i32, ptr @v, align 4
 // CHECK1-NEXT:ret i32 [[TMP1]]
@@ -120,13 +119,12 @@ void bar() {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:[[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:[[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:[[BAR_A:%.*]] = alloca float, align 4
 // CHECK1-NEXT:store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], 
align 8
 // CHECK1-NEXT:store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], 
align 8
-// CHECK1-NEXT:[[TMP0:%.*]] = load float, ptr [[BAR_A]], align 4
+// CHECK1-NEXT:[[TMP0:%.*]] = load float, ptr @bar_a, align 4
 // CHECK1-NEXT:[[CONV:%.*]] = fpext float [[TMP0]] to double
 // CHECK1-NEXT:store double [[CONV]], ptr addrspacecast (ptr addrspace(3) 
@bar_b to ptr), align 8
-// CHECK1-NEXT:call void @_Z3bazRf(ptr noundef nonnull align 4 
dereferenceable(4) [[BAR_A]]) #[[ATTR7]]
+// CHECK1-NEXT:call void @_Z3bazRf(ptr noundef nonnull align

[clang] [llvm] [OpenMP] Map `omp_default_mem_alloc` to global memory (PR #104790)

2024-08-19 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 updated 
https://github.com/llvm/llvm-project/pull/104790

>From eaa00ef74500833f280405c824d0282862c87b11 Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Mon, 19 Aug 2024 09:44:37 -0500
Subject: [PATCH] [OpenMP] Map `omp_default_mem_alloc` to global memory

Summary:
Currently, we assign this to private memory. This causes failures on
some SOLLVE tests. The standard isn't clear on the semantics of this
allocation type, but there seems to be a consensus that it's supposed to
be shared memory.
---
 clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 16 +---
 clang/test/OpenMP/nvptx_allocate_codegen.cpp | 10 --
 offload/test/api/omp_device_alloc.c  | 16 +++-
 3 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp 
b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 8965a14d88a6fb..77038b0f8ddc7b 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -2048,15 +2048,15 @@ Address 
CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
 const auto *A = VD->getAttr();
 auto AS = LangAS::Default;
 switch (A->getAllocatorType()) {
-  // Use the default allocator here as by default local vars are
-  // threadlocal.
 case OMPAllocateDeclAttr::OMPNullMemAlloc:
 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
-case OMPAllocateDeclAttr::OMPThreadMemAlloc:
 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
-  // Follow the user decision - use default allocation.
-  return Address::invalid();
+  AS = LangAS::opencl_global;
+  break;
+case OMPAllocateDeclAttr::OMPThreadMemAlloc:
+  AS = LangAS::opencl_private;
+  break;
 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
  // TODO: implement support for user-defined allocators.
   return Address::invalid();
@@ -2208,12 +2208,14 @@ bool 
CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
   case OMPAllocateDeclAttr::OMPNullMemAlloc:
   case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
   // Not supported, fallback to the default mem space.
-  case OMPAllocateDeclAttr::OMPThreadMemAlloc:
   case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
   case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
   case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
   case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
-AS = LangAS::Default;
+AS = LangAS::opencl_global;
+return true;
+  case OMPAllocateDeclAttr::OMPThreadMemAlloc:
+AS = LangAS::opencl_private;
 return true;
   case OMPAllocateDeclAttr::OMPConstMemAlloc:
 AS = LangAS::cuda_constant;
diff --git a/clang/test/OpenMP/nvptx_allocate_codegen.cpp 
b/clang/test/OpenMP/nvptx_allocate_codegen.cpp
index 3f3457dab33c2d..f4bd2458c3d17d 100644
--- a/clang/test/OpenMP/nvptx_allocate_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_allocate_codegen.cpp
@@ -87,10 +87,9 @@ void bar() {
 // CHECK1-SAME: () #[[ATTR0:[0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:[[RETVAL:%.*]] = alloca i32, align 4
-// CHECK1-NEXT:[[B:%.*]] = alloca double, align 8
 // CHECK1-NEXT:store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:store i32 2, ptr @_ZZ4mainE1a, align 4
-// CHECK1-NEXT:store double 3.00e+00, ptr [[B]], align 8
+// CHECK1-NEXT:store double 3.00e+00, ptr addrspacecast (ptr 
addrspace(1) @b1 to ptr), align 8
 // CHECK1-NEXT:[[CALL:%.*]] = call noundef i32 @_Z3fooIiET_v() 
#[[ATTR7:[0-9]+]]
 // CHECK1-NEXT:ret i32 [[CALL]]
 //
@@ -98,7 +97,7 @@ void bar() {
 // CHECK1-LABEL: define {{[^@]+}}@_Z3fooIiET_v
 // CHECK1-SAME: () #[[ATTR1:[0-9]+]] comdat {
 // CHECK1-NEXT:  entry:
-// CHECK1-NEXT:[[TMP0:%.*]] = load i32, ptr @_ZN2STIiE1mE, align 4
+// CHECK1-NEXT:[[TMP0:%.*]] = load i32, ptr addrspacecast (ptr 
addrspace(1) @_ZN2STIiE1mE to ptr), align 4
 // CHECK1-NEXT:store i32 [[TMP0]], ptr @v, align 4
 // CHECK1-NEXT:[[TMP1:%.*]] = load i32, ptr @v, align 4
 // CHECK1-NEXT:ret i32 [[TMP1]]
@@ -120,13 +119,12 @@ void bar() {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:[[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:[[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:[[BAR_A:%.*]] = alloca float, align 4
 // CHECK1-NEXT:store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], 
align 8
 // CHECK1-NEXT:store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], 
align 8
-// CHECK1-NEXT:[[TMP0:%.*]] = load float, ptr [[BAR_A]], align 4
+// CHECK1-NEXT:[[TMP0:%.*]] = load float, ptr @bar_a, align 4
 // CHECK1-NEXT:[[CONV:%.*]] = fpext float [[TMP0]] to double
 // CHECK1-NEXT:store double [[CONV]], ptr addrspacecast (ptr addrspace(3) 
@bar_b to ptr), align 8
-// CHECK1-NEXT:call void @_Z3bazRf(ptr noundef nonnull align 4 
dereferenceable(4) [[BAR_A]]) #[[ATTR7]]
+// CHECK1-NEXT:call void @_Z3bazRf(ptr noundef nonnull align 4 

[clang] [llvm] [OpenMP] Map `omp_default_mem_alloc` to global memory (PR #104790)

2024-08-19 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 created 
https://github.com/llvm/llvm-project/pull/104790

Summary:
Currently, we assign this to private memory. This causes failures on
some SOLLVE tests. The standard isn't clear on the semantics of this
allocation type, but there seems to be a consensus that it's supposed to
be shared memory.


>From 0b3955d078356bb82a5f1d750d19e81001bc8807 Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Mon, 19 Aug 2024 09:44:37 -0500
Subject: [PATCH] [OpenMP] Map `omp_default_mem_alloc` to global memory

Summary:
Currently, we assign this to private memory. This causes failures on
some SOLLVE tests. The standard isn't clear on the semantics of this
allocation type, but there seems to be a consensus that it's supposed to
be shared memory.
---
 clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 16 +---
 clang/test/OpenMP/nvptx_allocate_codegen.cpp | 10 --
 offload/test/api/omp_device_alloc.c  | 16 +++-
 3 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp 
b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 8965a14d88a6fb..77038b0f8ddc7b 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -2048,15 +2048,15 @@ Address 
CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
 const auto *A = VD->getAttr();
 auto AS = LangAS::Default;
 switch (A->getAllocatorType()) {
-  // Use the default allocator here as by default local vars are
-  // threadlocal.
 case OMPAllocateDeclAttr::OMPNullMemAlloc:
 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
-case OMPAllocateDeclAttr::OMPThreadMemAlloc:
 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
-  // Follow the user decision - use default allocation.
-  return Address::invalid();
+  AS = LangAS::opencl_global;
+  break;
+case OMPAllocateDeclAttr::OMPThreadMemAlloc:
+  AS = LangAS::opencl_private;
+  break;
 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
  // TODO: implement support for user-defined allocators.
   return Address::invalid();
@@ -2208,12 +2208,14 @@ bool 
CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
   case OMPAllocateDeclAttr::OMPNullMemAlloc:
   case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
   // Not supported, fallback to the default mem space.
-  case OMPAllocateDeclAttr::OMPThreadMemAlloc:
   case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
   case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
   case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
   case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
-AS = LangAS::Default;
+AS = LangAS::opencl_global;
+return true;
+  case OMPAllocateDeclAttr::OMPThreadMemAlloc:
+AS = LangAS::opencl_private;
 return true;
   case OMPAllocateDeclAttr::OMPConstMemAlloc:
 AS = LangAS::cuda_constant;
diff --git a/clang/test/OpenMP/nvptx_allocate_codegen.cpp 
b/clang/test/OpenMP/nvptx_allocate_codegen.cpp
index 3f3457dab33c2d..f4bd2458c3d17d 100644
--- a/clang/test/OpenMP/nvptx_allocate_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_allocate_codegen.cpp
@@ -87,10 +87,9 @@ void bar() {
 // CHECK1-SAME: () #[[ATTR0:[0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:[[RETVAL:%.*]] = alloca i32, align 4
-// CHECK1-NEXT:[[B:%.*]] = alloca double, align 8
 // CHECK1-NEXT:store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:store i32 2, ptr @_ZZ4mainE1a, align 4
-// CHECK1-NEXT:store double 3.00e+00, ptr [[B]], align 8
+// CHECK1-NEXT:store double 3.00e+00, ptr addrspacecast (ptr 
addrspace(1) @b1 to ptr), align 8
 // CHECK1-NEXT:[[CALL:%.*]] = call noundef i32 @_Z3fooIiET_v() 
#[[ATTR7:[0-9]+]]
 // CHECK1-NEXT:ret i32 [[CALL]]
 //
@@ -98,7 +97,7 @@ void bar() {
 // CHECK1-LABEL: define {{[^@]+}}@_Z3fooIiET_v
 // CHECK1-SAME: () #[[ATTR1:[0-9]+]] comdat {
 // CHECK1-NEXT:  entry:
-// CHECK1-NEXT:[[TMP0:%.*]] = load i32, ptr @_ZN2STIiE1mE, align 4
+// CHECK1-NEXT:[[TMP0:%.*]] = load i32, ptr addrspacecast (ptr 
addrspace(1) @_ZN2STIiE1mE to ptr), align 4
 // CHECK1-NEXT:store i32 [[TMP0]], ptr @v, align 4
 // CHECK1-NEXT:[[TMP1:%.*]] = load i32, ptr @v, align 4
 // CHECK1-NEXT:ret i32 [[TMP1]]
@@ -120,13 +119,12 @@ void bar() {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:[[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:[[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:[[BAR_A:%.*]] = alloca float, align 4
 // CHECK1-NEXT:store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], 
align 8
 // CHECK1-NEXT:store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], 
align 8
-// CHECK1-NEXT:[[TMP0:%.*]] = load float, ptr [[BAR_A]], align 4
+// CHECK1-NEXT:[[TMP0:%.*]] = load float, ptr @bar_a, align 4
 // CHECK1-NEXT:[[CONV:%.*]] = fpext float [[TMP0]] to double
 // CHECK1-NEXT:store double [[CONV]], 

[clang] [Clang] Fix sema checks thinking kernels aren't kernels (PR #104460)

2024-08-16 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 closed 
https://github.com/llvm/llvm-project/pull/104460
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Convert AMDGPUResourceUsageAnalysis pass from Module to MF pass (PR #102913)

2024-08-16 Thread Joseph Huber via cfe-commits

jhuber6 wrote:

I applied this locally and it resolved 
https://github.com/llvm/llvm-project/issues/64863 so I'm looking forward to 
this landing.

https://github.com/llvm/llvm-project/pull/102913
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Convert AMDGPUResourceUsageAnalysis pass from Module to MF pass (PR #102913)

2024-08-16 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 edited 
https://github.com/llvm/llvm-project/pull/102913
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang] Fix sema checks thinking kernels aren't kernels (PR #104460)

2024-08-15 Thread Joseph Huber via cfe-commits


@@ -7163,24 +7163,27 @@ void Sema::ProcessDeclAttributeList(
 } else if (const auto *A = D->getAttr()) {
   Diag(D->getLocation(), diag::err_opencl_kernel_attr) << A;
   D->setInvalidDecl();
-} else if (!D->hasAttr()) {
-  if (const auto *A = D->getAttr()) {
-Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
-<< A << A->isRegularKeywordAttribute() << ExpectedKernelFunction;
-D->setInvalidDecl();
-  } else if (const auto *A = D->getAttr()) {
-Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
-<< A << A->isRegularKeywordAttribute() << ExpectedKernelFunction;
-D->setInvalidDecl();
-  } else if (const auto *A = D->getAttr()) {
-Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
-<< A << A->isRegularKeywordAttribute() << ExpectedKernelFunction;
-D->setInvalidDecl();
-  } else if (const auto *A = D->getAttr()) {
-Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
-<< A << A->isRegularKeywordAttribute() << ExpectedKernelFunction;
-D->setInvalidDecl();
-  }
+}
+  }
+  const FunctionType *FnTy = D->getFunctionType();
+  if (!D->hasAttr() && !D->hasAttr() &&
+  FnTy && FnTy->getCallConv() != CallingConv::CC_AMDGPUKernelCall) {
+if (const auto *A = D->getAttr()) {
+  Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
+  << A << A->isRegularKeywordAttribute() << ExpectedKernelFunction;
+  D->setInvalidDecl();
+} else if (const auto *A = D->getAttr()) {
+  Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
+  << A << A->isRegularKeywordAttribute() << ExpectedKernelFunction;
+  D->setInvalidDecl();
+} else if (const auto *A = D->getAttr()) {
+  Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
+  << A << A->isRegularKeywordAttribute() << ExpectedKernelFunction;
+  D->setInvalidDecl();
+} else if (const auto *A = D->getAttr()) {
+  Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
+  << A << A->isRegularKeywordAttribute() << ExpectedKernelFunction;
+  D->setInvalidDecl();

jhuber6 wrote:

I also might want to revisit the CUDA launch bounds attr. IIRC we had to 
duplicate a lot of the CUDA attrs since they all require the CUDA language 
unlike the AMD ones.

https://github.com/llvm/llvm-project/pull/104460
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [Offload] Provide a kernel library useable by the offload runtime (PR #104168)

2024-08-15 Thread Joseph Huber via cfe-commits


@@ -393,22 +393,17 @@ struct CUDADeviceTy : public GenericDeviceTy {
 return Plugin::success();
   }
 
-  virtual Error callGlobalConstructors(GenericPluginTy &Plugin,
-   DeviceImageTy &Image) override {
-// Check for the presence of global destructors at initialization time. 
This
-// is required when the image may be deallocated before destructors are 
run.
-GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
-if (Handler.isSymbolInImage(*this, Image, "nvptx$device$fini"))
-  Image.setPendingGlobalDtors();
-
-return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/true);
+  virtual Expected
+  getGlobalConstructorName(DeviceImageTy &Image) override {
+if (auto Err = prepareGlobalCtorDtorCommon(Image, /*IsCtor=*/true))
+  return Err;
+return "nvptx$device$init";
   }
-
-  virtual Error callGlobalDestructors(GenericPluginTy &Plugin,
-  DeviceImageTy &Image) override {
-if (Image.hasPendingGlobalDtors())
-  return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false);
-return Plugin::success();
+  virtual Expected
+  getGlobalDestructorName(DeviceImageTy &Image) override {
+if (auto Err = prepareGlobalCtorDtorCommon(Image, /*IsCtor=*/false))

jhuber6 wrote:

What happened to the `pendingGlobalDtors` thing? That was required since some 
configs (i.e. JIT) don't have permanent storage so they could be deallocated 
during teardown. The better solution would be to make an API that copied in its 
own buffer (This is what hsa_executable_freeze is AFAIK).

https://github.com/llvm/llvm-project/pull/104168
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang] Fix sema checks thinking kernels aren't kernels (PR #104460)

2024-08-15 Thread Joseph Huber via cfe-commits


@@ -7163,24 +7163,27 @@ void Sema::ProcessDeclAttributeList(
 } else if (const auto *A = D->getAttr()) {
   Diag(D->getLocation(), diag::err_opencl_kernel_attr) << A;
   D->setInvalidDecl();
-} else if (!D->hasAttr()) {
-  if (const auto *A = D->getAttr()) {
-Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
-<< A << A->isRegularKeywordAttribute() << ExpectedKernelFunction;
-D->setInvalidDecl();
-  } else if (const auto *A = D->getAttr()) {
-Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
-<< A << A->isRegularKeywordAttribute() << ExpectedKernelFunction;
-D->setInvalidDecl();
-  } else if (const auto *A = D->getAttr()) {
-Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
-<< A << A->isRegularKeywordAttribute() << ExpectedKernelFunction;
-D->setInvalidDecl();
-  } else if (const auto *A = D->getAttr()) {
-Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
-<< A << A->isRegularKeywordAttribute() << ExpectedKernelFunction;
-D->setInvalidDecl();
-  }
+}
+  }
+  const FunctionType *FnTy = D->getFunctionType();
+  if (!D->hasAttr() && !D->hasAttr() &&
+  FnTy && FnTy->getCallConv() != CallingConv::CC_AMDGPUKernelCall) {
+if (const auto *A = D->getAttr()) {
+  Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
+  << A << A->isRegularKeywordAttribute() << ExpectedKernelFunction;
+  D->setInvalidDecl();
+} else if (const auto *A = D->getAttr()) {
+  Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
+  << A << A->isRegularKeywordAttribute() << ExpectedKernelFunction;
+  D->setInvalidDecl();
+} else if (const auto *A = D->getAttr()) {
+  Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
+  << A << A->isRegularKeywordAttribute() << ExpectedKernelFunction;
+  D->setInvalidDecl();
+} else if (const auto *A = D->getAttr()) {
+  Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
+  << A << A->isRegularKeywordAttribute() << ExpectedKernelFunction;
+  D->setInvalidDecl();

jhuber6 wrote:

Nevermind, fails in practice because to put it in a uniform container we need 
to erase them all to `Attr *` which then seems to cause some unfortunate side 
effects.

https://github.com/llvm/llvm-project/pull/104460
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang] Fix sema checks thinking kernels aren't kernels (PR #104460)

2024-08-15 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 updated 
https://github.com/llvm/llvm-project/pull/104460

>From 84c89feaea8135f7dfaba488b442818974b51c9d Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Thu, 15 Aug 2024 10:34:17 -0500
Subject: [PATCH 1/3] [Clang] Fix sema checks thinking kernels aren't kernels

Summary:
Currently we have some sema checks to make sure users don't apply
kernel-only attributes to non-kernel functions. However, this currently
did not correctly check for bare NVPTX / AMDGPU kernel attributes,
making it impossible to use them at all w/o CUDA enabled. This patch
fixes that by checking for the calling convention / attributes directly.
---
 clang/lib/Sema/SemaDeclAttr.cpp  | 7 +--
 clang/test/CodeGenCXX/amdgpu-kernel-arg-pointer-type.cpp | 5 +
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 3b5e984f4ee773..96c3759de7edc1 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -7147,7 +7147,9 @@ void Sema::ProcessDeclAttributeList(
   // good to have a way to specify "these attributes must appear as a group",
   // for these. Additionally, it would be good to have a way to specify "these
   // attribute must never appear as a group" for attributes like cold and hot.
-  if (!D->hasAttr()) {
+  const FunctionType *FnTy = D->getFunctionType();
+  if (!D->hasAttr() && FnTy &&
+  FnTy->getCallConv() != CallingConv::CC_AMDGPUKernelCall) {
 // These attributes cannot be applied to a non-kernel function.
 if (const auto *A = D->getAttr()) {
   // FIXME: This emits a different error message than
@@ -7163,7 +7165,8 @@ void Sema::ProcessDeclAttributeList(
 } else if (const auto *A = D->getAttr()) {
   Diag(D->getLocation(), diag::err_opencl_kernel_attr) << A;
   D->setInvalidDecl();
-} else if (!D->hasAttr()) {
+} else if (!D->hasAttr() &&
+   !D->hasAttr()) {
   if (const auto *A = D->getAttr()) {
 Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
 << A << A->isRegularKeywordAttribute() << ExpectedKernelFunction;
diff --git a/clang/test/CodeGenCXX/amdgpu-kernel-arg-pointer-type.cpp 
b/clang/test/CodeGenCXX/amdgpu-kernel-arg-pointer-type.cpp
index c8fb12315d0eec..8df1d75e9ec8fa 100644
--- a/clang/test/CodeGenCXX/amdgpu-kernel-arg-pointer-type.cpp
+++ b/clang/test/CodeGenCXX/amdgpu-kernel-arg-pointer-type.cpp
@@ -6,6 +6,10 @@
 // The original test passes the result through opt O2, but that seems to 
introduce invalid
 // addrspace casts which are not being fixed as part of the present change.
 
+// COMMON: define{{.*}} amdgpu_kernel void @_Z6kernelv() #[[ATTR:[0-9]+]]
+__attribute__((amdgpu_kernel, amdgpu_flat_work_group_size(1, 256))) void
+kernel() {}
+
 // COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel1Pi(ptr {{.*}} %x)
 // CHECK-NOT: ={{.*}} addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to ptr
 __attribute__((amdgpu_kernel)) void kernel1(int *x) {
@@ -81,3 +85,4 @@ __attribute__((amdgpu_kernel)) void kernel8(struct SS a) {
   *a.x += 3.f;
 }
 
+// COMMON: attributes #[[ATTR]] = { 
{{.*}}"amdgpu-flat-work-group-size"="1,256"{{.*}} }

>From 25012dc87e9cb694b74f5f5c1e4105cca9954e20 Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Thu, 15 Aug 2024 11:07:49 -0500
Subject: [PATCH 2/3] Address comments

---
 clang/lib/Sema/SemaDeclAttr.cpp | 44 -
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 96c3759de7edc1..c10fd87488b2b4 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -7147,9 +7147,7 @@ void Sema::ProcessDeclAttributeList(
   // good to have a way to specify "these attributes must appear as a group",
   // for these. Additionally, it would be good to have a way to specify "these
   // attribute must never appear as a group" for attributes like cold and hot.
-  const FunctionType *FnTy = D->getFunctionType();
-  if (!D->hasAttr() && FnTy &&
-  FnTy->getCallConv() != CallingConv::CC_AMDGPUKernelCall) {
+  if (!D->hasAttr()) {
 // These attributes cannot be applied to a non-kernel function.
 if (const auto *A = D->getAttr()) {
   // FIXME: This emits a different error message than
@@ -7165,25 +7163,27 @@ void Sema::ProcessDeclAttributeList(
 } else if (const auto *A = D->getAttr()) {
   Diag(D->getLocation(), diag::err_opencl_kernel_attr) << A;
   D->setInvalidDecl();
-} else if (!D->hasAttr() &&
-   !D->hasAttr()) {
-  if (const auto *A = D->getAttr()) {
-Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
-<< A << A->isRegularKeywordAttribute() << ExpectedKernelFunction;
-D->setInvalidDecl();
-  } else if (const auto *A = D->getAttr()) {
-Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
-<< A <

[clang] [Clang] Fix sema checks thinking kernels aren't kernels (PR #104460)

2024-08-15 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 updated 
https://github.com/llvm/llvm-project/pull/104460

>From 84c89feaea8135f7dfaba488b442818974b51c9d Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Thu, 15 Aug 2024 10:34:17 -0500
Subject: [PATCH 1/3] [Clang] Fix sema checks thinking kernels aren't kernels

Summary:
Currently we have some sema checks to make sure users don't apply
kernel-only attributes to non-kernel functions. However, this currently
did not correctly check for bare NVPTX / AMDGPU kernel attributes,
making it impossible to use them at all w/o CUDA enabled. This patch
fixes that by checking for the calling convention / attributes directly.
---
 clang/lib/Sema/SemaDeclAttr.cpp  | 7 +--
 clang/test/CodeGenCXX/amdgpu-kernel-arg-pointer-type.cpp | 5 +
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 3b5e984f4ee773..96c3759de7edc1 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -7147,7 +7147,9 @@ void Sema::ProcessDeclAttributeList(
   // good to have a way to specify "these attributes must appear as a group",
   // for these. Additionally, it would be good to have a way to specify "these
   // attribute must never appear as a group" for attributes like cold and hot.
-  if (!D->hasAttr()) {
+  const FunctionType *FnTy = D->getFunctionType();
+  if (!D->hasAttr() && FnTy &&
+  FnTy->getCallConv() != CallingConv::CC_AMDGPUKernelCall) {
 // These attributes cannot be applied to a non-kernel function.
 if (const auto *A = D->getAttr()) {
   // FIXME: This emits a different error message than
@@ -7163,7 +7165,8 @@ void Sema::ProcessDeclAttributeList(
 } else if (const auto *A = D->getAttr()) {
   Diag(D->getLocation(), diag::err_opencl_kernel_attr) << A;
   D->setInvalidDecl();
-} else if (!D->hasAttr()) {
+} else if (!D->hasAttr() &&
+   !D->hasAttr()) {
   if (const auto *A = D->getAttr()) {
 Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
 << A << A->isRegularKeywordAttribute() << ExpectedKernelFunction;
diff --git a/clang/test/CodeGenCXX/amdgpu-kernel-arg-pointer-type.cpp 
b/clang/test/CodeGenCXX/amdgpu-kernel-arg-pointer-type.cpp
index c8fb12315d0eec..8df1d75e9ec8fa 100644
--- a/clang/test/CodeGenCXX/amdgpu-kernel-arg-pointer-type.cpp
+++ b/clang/test/CodeGenCXX/amdgpu-kernel-arg-pointer-type.cpp
@@ -6,6 +6,10 @@
 // The original test passes the result through opt O2, but that seems to 
introduce invalid
 // addrspace casts which are not being fixed as part of the present change.
 
+// COMMON: define{{.*}} amdgpu_kernel void @_Z6kernelv() #[[ATTR:[0-9]+]]
+__attribute__((amdgpu_kernel, amdgpu_flat_work_group_size(1, 256))) void
+kernel() {}
+
 // COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel1Pi(ptr {{.*}} %x)
 // CHECK-NOT: ={{.*}} addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to ptr
 __attribute__((amdgpu_kernel)) void kernel1(int *x) {
@@ -81,3 +85,4 @@ __attribute__((amdgpu_kernel)) void kernel8(struct SS a) {
   *a.x += 3.f;
 }
 
+// COMMON: attributes #[[ATTR]] = { 
{{.*}}"amdgpu-flat-work-group-size"="1,256"{{.*}} }

>From 25012dc87e9cb694b74f5f5c1e4105cca9954e20 Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Thu, 15 Aug 2024 11:07:49 -0500
Subject: [PATCH 2/3] Address comments

---
 clang/lib/Sema/SemaDeclAttr.cpp | 44 -
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 96c3759de7edc1..c10fd87488b2b4 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -7147,9 +7147,7 @@ void Sema::ProcessDeclAttributeList(
   // good to have a way to specify "these attributes must appear as a group",
   // for these. Additionally, it would be good to have a way to specify "these
   // attribute must never appear as a group" for attributes like cold and hot.
-  const FunctionType *FnTy = D->getFunctionType();
-  if (!D->hasAttr() && FnTy &&
-  FnTy->getCallConv() != CallingConv::CC_AMDGPUKernelCall) {
+  if (!D->hasAttr()) {
 // These attributes cannot be applied to a non-kernel function.
 if (const auto *A = D->getAttr()) {
   // FIXME: This emits a different error message than
@@ -7165,25 +7163,27 @@ void Sema::ProcessDeclAttributeList(
 } else if (const auto *A = D->getAttr()) {
   Diag(D->getLocation(), diag::err_opencl_kernel_attr) << A;
   D->setInvalidDecl();
-} else if (!D->hasAttr() &&
-   !D->hasAttr()) {
-  if (const auto *A = D->getAttr()) {
-Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
-<< A << A->isRegularKeywordAttribute() << ExpectedKernelFunction;
-D->setInvalidDecl();
-  } else if (const auto *A = D->getAttr()) {
-Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
-<< A <

[clang] [Clang] Fix sema checks thinking kernels aren't kernels (PR #104460)

2024-08-15 Thread Joseph Huber via cfe-commits


@@ -7163,24 +7163,27 @@ void Sema::ProcessDeclAttributeList(
 } else if (const auto *A = D->getAttr()) {
   Diag(D->getLocation(), diag::err_opencl_kernel_attr) << A;
   D->setInvalidDecl();
-} else if (!D->hasAttr()) {
-  if (const auto *A = D->getAttr()) {
-Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
-<< A << A->isRegularKeywordAttribute() << ExpectedKernelFunction;
-D->setInvalidDecl();
-  } else if (const auto *A = D->getAttr()) {
-Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
-<< A << A->isRegularKeywordAttribute() << ExpectedKernelFunction;
-D->setInvalidDecl();
-  } else if (const auto *A = D->getAttr()) {
-Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
-<< A << A->isRegularKeywordAttribute() << ExpectedKernelFunction;
-D->setInvalidDecl();
-  } else if (const auto *A = D->getAttr()) {
-Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
-<< A << A->isRegularKeywordAttribute() << ExpectedKernelFunction;
-D->setInvalidDecl();
-  }
+}
+  }
+  const FunctionType *FnTy = D->getFunctionType();
+  if (!D->hasAttr() && !D->hasAttr() &&
+  FnTy && FnTy->getCallConv() != CallingConv::CC_AMDGPUKernelCall) {
+if (const auto *A = D->getAttr()) {
+  Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
+  << A << A->isRegularKeywordAttribute() << ExpectedKernelFunction;
+  D->setInvalidDecl();
+} else if (const auto *A = D->getAttr()) {
+  Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
+  << A << A->isRegularKeywordAttribute() << ExpectedKernelFunction;
+  D->setInvalidDecl();
+} else if (const auto *A = D->getAttr()) {
+  Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
+  << A << A->isRegularKeywordAttribute() << ExpectedKernelFunction;
+  D->setInvalidDecl();
+} else if (const auto *A = D->getAttr()) {
+  Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
+  << A << A->isRegularKeywordAttribute() << ExpectedKernelFunction;
+  D->setInvalidDecl();

jhuber6 wrote:

Done

https://github.com/llvm/llvm-project/pull/104460
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang] Fix sema checks thinking kernels aren't kernels (PR #104460)

2024-08-15 Thread Joseph Huber via cfe-commits


@@ -7163,24 +7163,27 @@ void Sema::ProcessDeclAttributeList(
 } else if (const auto *A = D->getAttr()) {
   Diag(D->getLocation(), diag::err_opencl_kernel_attr) << A;
   D->setInvalidDecl();
-} else if (!D->hasAttr()) {
-  if (const auto *A = D->getAttr()) {
-Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
-<< A << A->isRegularKeywordAttribute() << ExpectedKernelFunction;
-D->setInvalidDecl();
-  } else if (const auto *A = D->getAttr()) {
-Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
-<< A << A->isRegularKeywordAttribute() << ExpectedKernelFunction;
-D->setInvalidDecl();
-  } else if (const auto *A = D->getAttr()) {
-Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
-<< A << A->isRegularKeywordAttribute() << ExpectedKernelFunction;
-D->setInvalidDecl();
-  } else if (const auto *A = D->getAttr()) {
-Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
-<< A << A->isRegularKeywordAttribute() << ExpectedKernelFunction;
-D->setInvalidDecl();
-  }
+}
+  }
+  const FunctionType *FnTy = D->getFunctionType();
+  if (!D->hasAttr() && !D->hasAttr() &&
+  FnTy && FnTy->getCallConv() != CallingConv::CC_AMDGPUKernelCall) {

jhuber6 wrote:

Made it a helper.

https://github.com/llvm/llvm-project/pull/104460
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang] Fix sema checks thinking kernels aren't kernels (PR #104460)

2024-08-15 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 updated 
https://github.com/llvm/llvm-project/pull/104460

>From 84c89feaea8135f7dfaba488b442818974b51c9d Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Thu, 15 Aug 2024 10:34:17 -0500
Subject: [PATCH 1/3] [Clang] Fix sema checks thinking kernels aren't kernels

Summary:
Currently we have some sema checks to make sure users don't apply
kernel-only attributes to non-kernel functions. However, this currently
did not correctly check for bare NVPTX / AMDGPU kernel attributes,
making it impossible to use them at all w/o CUDA enabled. This patch
fixes that by checking for the calling convention / attributes directly.
---
 clang/lib/Sema/SemaDeclAttr.cpp  | 7 +--
 clang/test/CodeGenCXX/amdgpu-kernel-arg-pointer-type.cpp | 5 +
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 3b5e984f4ee773..96c3759de7edc1 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -7147,7 +7147,9 @@ void Sema::ProcessDeclAttributeList(
   // good to have a way to specify "these attributes must appear as a group",
   // for these. Additionally, it would be good to have a way to specify "these
   // attribute must never appear as a group" for attributes like cold and hot.
-  if (!D->hasAttr()) {
+  const FunctionType *FnTy = D->getFunctionType();
+  if (!D->hasAttr() && FnTy &&
+  FnTy->getCallConv() != CallingConv::CC_AMDGPUKernelCall) {
 // These attributes cannot be applied to a non-kernel function.
 if (const auto *A = D->getAttr()) {
   // FIXME: This emits a different error message than
@@ -7163,7 +7165,8 @@ void Sema::ProcessDeclAttributeList(
 } else if (const auto *A = D->getAttr()) {
   Diag(D->getLocation(), diag::err_opencl_kernel_attr) << A;
   D->setInvalidDecl();
-} else if (!D->hasAttr()) {
+} else if (!D->hasAttr() &&
+   !D->hasAttr()) {
   if (const auto *A = D->getAttr()) {
 Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
 << A << A->isRegularKeywordAttribute() << ExpectedKernelFunction;
diff --git a/clang/test/CodeGenCXX/amdgpu-kernel-arg-pointer-type.cpp 
b/clang/test/CodeGenCXX/amdgpu-kernel-arg-pointer-type.cpp
index c8fb12315d0eec..8df1d75e9ec8fa 100644
--- a/clang/test/CodeGenCXX/amdgpu-kernel-arg-pointer-type.cpp
+++ b/clang/test/CodeGenCXX/amdgpu-kernel-arg-pointer-type.cpp
@@ -6,6 +6,10 @@
 // The original test passes the result through opt O2, but that seems to 
introduce invalid
 // addrspace casts which are not being fixed as part of the present change.
 
+// COMMON: define{{.*}} amdgpu_kernel void @_Z6kernelv() #[[ATTR:[0-9]+]]
+__attribute__((amdgpu_kernel, amdgpu_flat_work_group_size(1, 256))) void
+kernel() {}
+
 // COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel1Pi(ptr {{.*}} %x)
 // CHECK-NOT: ={{.*}} addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to ptr
 __attribute__((amdgpu_kernel)) void kernel1(int *x) {
@@ -81,3 +85,4 @@ __attribute__((amdgpu_kernel)) void kernel8(struct SS a) {
   *a.x += 3.f;
 }
 
+// COMMON: attributes #[[ATTR]] = { 
{{.*}}"amdgpu-flat-work-group-size"="1,256"{{.*}} }

>From 25012dc87e9cb694b74f5f5c1e4105cca9954e20 Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Thu, 15 Aug 2024 11:07:49 -0500
Subject: [PATCH 2/3] Address comments

---
 clang/lib/Sema/SemaDeclAttr.cpp | 44 -
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 96c3759de7edc1..c10fd87488b2b4 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -7147,9 +7147,7 @@ void Sema::ProcessDeclAttributeList(
   // good to have a way to specify "these attributes must appear as a group",
   // for these. Additionally, it would be good to have a way to specify "these
   // attribute must never appear as a group" for attributes like cold and hot.
-  const FunctionType *FnTy = D->getFunctionType();
-  if (!D->hasAttr() && FnTy &&
-  FnTy->getCallConv() != CallingConv::CC_AMDGPUKernelCall) {
+  if (!D->hasAttr()) {
 // These attributes cannot be applied to a non-kernel function.
 if (const auto *A = D->getAttr()) {
   // FIXME: This emits a different error message than
@@ -7165,25 +7163,27 @@ void Sema::ProcessDeclAttributeList(
 } else if (const auto *A = D->getAttr()) {
   Diag(D->getLocation(), diag::err_opencl_kernel_attr) << A;
   D->setInvalidDecl();
-} else if (!D->hasAttr() &&
-   !D->hasAttr()) {
-  if (const auto *A = D->getAttr()) {
-Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
-<< A << A->isRegularKeywordAttribute() << ExpectedKernelFunction;
-D->setInvalidDecl();
-  } else if (const auto *A = D->getAttr()) {
-Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
-<< A <

[clang] [Clang] Fix sema checks thinking kernels aren't kernels (PR #104460)

2024-08-15 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 updated 
https://github.com/llvm/llvm-project/pull/104460

>From 84c89feaea8135f7dfaba488b442818974b51c9d Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Thu, 15 Aug 2024 10:34:17 -0500
Subject: [PATCH 1/2] [Clang] Fix sema checks thinking kernels aren't kernels

Summary:
Currently we have some sema checks to make sure users don't apply
kernel-only attributes to non-kernel functions. However, this currently
did not correctly check for bare NVPTX / AMDGPU kernel attributes,
making it impossible to use them at all w/o CUDA enabled. This patch
fixes that by checking for the calling convention / attributes directly.
---
 clang/lib/Sema/SemaDeclAttr.cpp  | 7 +--
 clang/test/CodeGenCXX/amdgpu-kernel-arg-pointer-type.cpp | 5 +
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 3b5e984f4ee773..96c3759de7edc1 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -7147,7 +7147,9 @@ void Sema::ProcessDeclAttributeList(
   // good to have a way to specify "these attributes must appear as a group",
   // for these. Additionally, it would be good to have a way to specify "these
   // attribute must never appear as a group" for attributes like cold and hot.
-  if (!D->hasAttr()) {
+  const FunctionType *FnTy = D->getFunctionType();
+  if (!D->hasAttr() && FnTy &&
+  FnTy->getCallConv() != CallingConv::CC_AMDGPUKernelCall) {
 // These attributes cannot be applied to a non-kernel function.
 if (const auto *A = D->getAttr()) {
   // FIXME: This emits a different error message than
@@ -7163,7 +7165,8 @@ void Sema::ProcessDeclAttributeList(
 } else if (const auto *A = D->getAttr()) {
   Diag(D->getLocation(), diag::err_opencl_kernel_attr) << A;
   D->setInvalidDecl();
-} else if (!D->hasAttr()) {
+} else if (!D->hasAttr() &&
+   !D->hasAttr()) {
   if (const auto *A = D->getAttr()) {
 Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
 << A << A->isRegularKeywordAttribute() << ExpectedKernelFunction;
diff --git a/clang/test/CodeGenCXX/amdgpu-kernel-arg-pointer-type.cpp 
b/clang/test/CodeGenCXX/amdgpu-kernel-arg-pointer-type.cpp
index c8fb12315d0eec..8df1d75e9ec8fa 100644
--- a/clang/test/CodeGenCXX/amdgpu-kernel-arg-pointer-type.cpp
+++ b/clang/test/CodeGenCXX/amdgpu-kernel-arg-pointer-type.cpp
@@ -6,6 +6,10 @@
 // The original test passes the result through opt O2, but that seems to 
introduce invalid
 // addrspace casts which are not being fixed as part of the present change.
 
+// COMMON: define{{.*}} amdgpu_kernel void @_Z6kernelv() #[[ATTR:[0-9]+]]
+__attribute__((amdgpu_kernel, amdgpu_flat_work_group_size(1, 256))) void
+kernel() {}
+
 // COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel1Pi(ptr {{.*}} %x)
 // CHECK-NOT: ={{.*}} addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to ptr
 __attribute__((amdgpu_kernel)) void kernel1(int *x) {
@@ -81,3 +85,4 @@ __attribute__((amdgpu_kernel)) void kernel8(struct SS a) {
   *a.x += 3.f;
 }
 
+// COMMON: attributes #[[ATTR]] = { 
{{.*}}"amdgpu-flat-work-group-size"="1,256"{{.*}} }

>From 25012dc87e9cb694b74f5f5c1e4105cca9954e20 Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Thu, 15 Aug 2024 11:07:49 -0500
Subject: [PATCH 2/2] Address comments

---
 clang/lib/Sema/SemaDeclAttr.cpp | 44 -
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 96c3759de7edc1..c10fd87488b2b4 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -7147,9 +7147,7 @@ void Sema::ProcessDeclAttributeList(
   // good to have a way to specify "these attributes must appear as a group",
   // for these. Additionally, it would be good to have a way to specify "these
   // attribute must never appear as a group" for attributes like cold and hot.
-  const FunctionType *FnTy = D->getFunctionType();
-  if (!D->hasAttr() && FnTy &&
-  FnTy->getCallConv() != CallingConv::CC_AMDGPUKernelCall) {
+  if (!D->hasAttr()) {
 // These attributes cannot be applied to a non-kernel function.
 if (const auto *A = D->getAttr()) {
   // FIXME: This emits a different error message than
@@ -7165,25 +7163,27 @@ void Sema::ProcessDeclAttributeList(
 } else if (const auto *A = D->getAttr()) {
   Diag(D->getLocation(), diag::err_opencl_kernel_attr) << A;
   D->setInvalidDecl();
-} else if (!D->hasAttr() &&
-   !D->hasAttr()) {
-  if (const auto *A = D->getAttr()) {
-Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
-<< A << A->isRegularKeywordAttribute() << ExpectedKernelFunction;
-D->setInvalidDecl();
-  } else if (const auto *A = D->getAttr()) {
-Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
-<< A <

[clang] [Clang] Fix sema checks thinking kernels aren't kernels (PR #104460)

2024-08-15 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 updated 
https://github.com/llvm/llvm-project/pull/104460

>From 84c89feaea8135f7dfaba488b442818974b51c9d Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Thu, 15 Aug 2024 10:34:17 -0500
Subject: [PATCH 1/2] [Clang] Fix sema checks thinking kernels aren't kernels

Summary:
Currently we have some sema checks to make sure users don't apply
kernel-only attributes to non-kernel functions. However, this currently
did not correctly check for bare NVPTX / AMDGPU kernel attributes,
making it impossible to use them at all w/o CUDA enabled. This patch
fixes that by checking for the calling convention / attributes directly.
---
 clang/lib/Sema/SemaDeclAttr.cpp  | 7 +--
 clang/test/CodeGenCXX/amdgpu-kernel-arg-pointer-type.cpp | 5 +
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 3b5e984f4ee773..96c3759de7edc1 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -7147,7 +7147,9 @@ void Sema::ProcessDeclAttributeList(
   // good to have a way to specify "these attributes must appear as a group",
   // for these. Additionally, it would be good to have a way to specify "these
   // attribute must never appear as a group" for attributes like cold and hot.
-  if (!D->hasAttr()) {
+  const FunctionType *FnTy = D->getFunctionType();
+  if (!D->hasAttr() && FnTy &&
+  FnTy->getCallConv() != CallingConv::CC_AMDGPUKernelCall) {
 // These attributes cannot be applied to a non-kernel function.
 if (const auto *A = D->getAttr()) {
   // FIXME: This emits a different error message than
@@ -7163,7 +7165,8 @@ void Sema::ProcessDeclAttributeList(
 } else if (const auto *A = D->getAttr()) {
   Diag(D->getLocation(), diag::err_opencl_kernel_attr) << A;
   D->setInvalidDecl();
-} else if (!D->hasAttr()) {
+} else if (!D->hasAttr() &&
+   !D->hasAttr()) {
   if (const auto *A = D->getAttr()) {
 Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
 << A << A->isRegularKeywordAttribute() << ExpectedKernelFunction;
diff --git a/clang/test/CodeGenCXX/amdgpu-kernel-arg-pointer-type.cpp 
b/clang/test/CodeGenCXX/amdgpu-kernel-arg-pointer-type.cpp
index c8fb12315d0eec..8df1d75e9ec8fa 100644
--- a/clang/test/CodeGenCXX/amdgpu-kernel-arg-pointer-type.cpp
+++ b/clang/test/CodeGenCXX/amdgpu-kernel-arg-pointer-type.cpp
@@ -6,6 +6,10 @@
 // The original test passes the result through opt O2, but that seems to 
introduce invalid
 // addrspace casts which are not being fixed as part of the present change.
 
+// COMMON: define{{.*}} amdgpu_kernel void @_Z6kernelv() #[[ATTR:[0-9]+]]
+__attribute__((amdgpu_kernel, amdgpu_flat_work_group_size(1, 256))) void
+kernel() {}
+
 // COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel1Pi(ptr {{.*}} %x)
 // CHECK-NOT: ={{.*}} addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to ptr
 __attribute__((amdgpu_kernel)) void kernel1(int *x) {
@@ -81,3 +85,4 @@ __attribute__((amdgpu_kernel)) void kernel8(struct SS a) {
   *a.x += 3.f;
 }
 
+// COMMON: attributes #[[ATTR]] = { 
{{.*}}"amdgpu-flat-work-group-size"="1,256"{{.*}} }

>From 6a89582e570f2e2e7c347fedd3bbcf8b5b20eba2 Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Thu, 15 Aug 2024 11:07:49 -0500
Subject: [PATCH 2/2] Address comments

---
 clang/lib/Sema/SemaDeclAttr.cpp | 44 -
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 96c3759de7edc1..91464f739f1025 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -7147,9 +7147,7 @@ void Sema::ProcessDeclAttributeList(
   // good to have a way to specify "these attributes must appear as a group",
   // for these. Additionally, it would be good to have a way to specify "these
   // attribute must never appear as a group" for attributes like cold and hot.
-  const FunctionType *FnTy = D->getFunctionType();
-  if (!D->hasAttr() && FnTy &&
-  FnTy->getCallConv() != CallingConv::CC_AMDGPUKernelCall) {
+  if (!D->hasAttr()) {
 // These attributes cannot be applied to a non-kernel function.
 if (const auto *A = D->getAttr()) {
   // FIXME: This emits a different error message than
@@ -7165,25 +7163,27 @@ void Sema::ProcessDeclAttributeList(
 } else if (const auto *A = D->getAttr()) {
   Diag(D->getLocation(), diag::err_opencl_kernel_attr) << A;
   D->setInvalidDecl();
-} else if (!D->hasAttr() &&
-   !D->hasAttr()) {
-  if (const auto *A = D->getAttr()) {
-Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
-<< A << A->isRegularKeywordAttribute() << ExpectedKernelFunction;
-D->setInvalidDecl();
-  } else if (const auto *A = D->getAttr()) {
-Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
-<< A <

[clang] [Clang] Fix sema checks thinking kernels aren't kernels (PR #104460)

2024-08-15 Thread Joseph Huber via cfe-commits


@@ -7163,7 +7165,8 @@ void Sema::ProcessDeclAttributeList(
 } else if (const auto *A = D->getAttr()) {
   Diag(D->getLocation(), diag::err_opencl_kernel_attr) << A;
   D->setInvalidDecl();
-} else if (!D->hasAttr()) {
+} else if (!D->hasAttr() &&

jhuber6 wrote:

Updated, I think this is what you meant?

https://github.com/llvm/llvm-project/pull/104460
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


  1   2   3   4   5   6   7   8   9   10   >