[clang] [libc] [Clang][NFC] Replace device specific kernel attribute with generic one (PR #176250)

Joseph Huber via cfe-commits Thu, 15 Jan 2026 13:39:47 -0800

https://github.com/jhuber6 created 
https://github.com/llvm/llvm-project/pull/176250


Summary:
The old `amdgpu_kernel` and `nvptx_kernel` attributes are better
replaced by the new, generic `device_kernel`.


>From 5d55beb44ac52545a9c8ccabed415dabed20af88 Mon Sep 17 00:00:00 2001
From: Joseph Huber <[email protected]>
Date: Thu, 15 Jan 2026 15:32:09 -0600
Subject: [PATCH] [Clang][NFC] Replace device specific kernel attribute with
 generic one

Summary:
The old `amdgpu_kernel` and `nvptx_kernel` attributes are better
replaced by the new, generic `device_kernel`.
---
 clang/lib/Headers/amdgpuintrin.h  | 3 ---
 clang/lib/Headers/gpuintrin.h     | 3 +++
 clang/lib/Headers/nvptxintrin.h   | 3 ---
 clang/lib/Headers/spirvintrin.h   | 3 ---
 libc/startup/gpu/amdgpu/start.cpp | 4 ++--
 libc/startup/gpu/nvptx/start.cpp  | 6 +++---
 6 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index f7fb8e2814180..e0989e0a2d097 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -27,9 +27,6 @@ _Pragma("omp begin declare variant match(device = 
{arch(amdgcn)})");
 #define __gpu_global __attribute__((address_space(1)))
 #define __gpu_generic __attribute__((address_space(0)))
 
-// Attribute to declare a function as a kernel.
-#define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
-
 // Returns the number of workgroups in the 'x' dimension of the grid.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
   return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index f3cf2d0776c0c..010ec2264dc5f 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -69,6 +69,9 @@ _Pragma("omp end declare target");
 _Pragma("omp begin declare target device_type(nohost)");
 _Pragma("omp begin declare variant match(device = {kind(gpu)})");
 
+// Attribute to declare a function as a kernel.
+#define __gpu_kernel __attribute__((device_kernel, visibility("protected")))
+
 #define __GPU_X_DIM 0
 #define __GPU_Y_DIM 1
 #define __GPU_Z_DIM 2
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index fb811d0d58394..b2e538580ba10 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -31,9 +31,6 @@ _Pragma("omp begin declare variant match(device = 
{arch(nvptx64)})");
 #define __gpu_global __attribute__((address_space(1)))
 #define __gpu_generic __attribute__((address_space(0)))
 
-// Attribute to declare a function as a kernel.
-#define __gpu_kernel __attribute__((nvptx_kernel, visibility("protected")))
-
 // Returns the number of CUDA blocks in the 'x' dimension.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
   return __nvvm_read_ptx_sreg_nctaid_x();
diff --git a/clang/lib/Headers/spirvintrin.h b/clang/lib/Headers/spirvintrin.h
index 2a10a47adedde..9658f280b247d 100644
--- a/clang/lib/Headers/spirvintrin.h
+++ b/clang/lib/Headers/spirvintrin.h
@@ -27,9 +27,6 @@ _Pragma("omp begin declare variant match(device = 
{arch(spirv64)})");
 #define __gpu_global __attribute__((address_space(1)))
 #define __gpu_generic __attribute__((address_space(4)))
 
-// Attribute to declare a function as a kernel.
-#define __gpu_kernel __attribute__((device_kernel, visibility("protected")))
-
 // Returns the number of workgroups in the 'x' dimension of the grid.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
   return __builtin_spirv_num_workgroups(0);
diff --git a/libc/startup/gpu/amdgpu/start.cpp 
b/libc/startup/gpu/amdgpu/start.cpp
index 446eead4e3935..47c7e1ccf10bd 100644
--- a/libc/startup/gpu/amdgpu/start.cpp
+++ b/libc/startup/gpu/amdgpu/start.cpp
@@ -45,7 +45,7 @@ static void call_fini_array_callbacks() {
 
 } // namespace LIBC_NAMESPACE_DECL
 
-extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel,
+extern "C" [[gnu::visibility("protected"), clang::device_kernel,
              clang::amdgpu_flat_work_group_size(1, 1),
              clang::amdgpu_max_num_work_groups(1)]] void
 _begin(int argc, char **argv, char **env) {
@@ -59,7 +59,7 @@ _begin(int argc, char **argv, char **env) {
   LIBC_NAMESPACE::call_init_array_callbacks(argc, argv, env);
 }
 
-extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
+extern "C" [[gnu::visibility("protected"), clang::device_kernel]] void
 _start(int argc, char **argv, char **envp, int *ret) {
   // Invoke the 'main' function with every active thread that the user launched
   // the _start kernel with.
diff --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp
index be71bafa7c458..dc15b1be8c04f 100644
--- a/libc/startup/gpu/nvptx/start.cpp
+++ b/libc/startup/gpu/nvptx/start.cpp
@@ -51,7 +51,7 @@ static void call_fini_array_callbacks() {
 
 } // namespace LIBC_NAMESPACE_DECL
 
-extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
+extern "C" [[gnu::visibility("protected"), clang::device_kernel]] void
 _begin(int argc, char **argv, char **env) {
   __atomic_store_n(&LIBC_NAMESPACE::app.env_ptr,
                    reinterpret_cast<uintptr_t *>(env), __ATOMIC_RELAXED);
@@ -64,14 +64,14 @@ _begin(int argc, char **argv, char **env) {
   LIBC_NAMESPACE::call_init_array_callbacks(argc, argv, env);
 }
 
-extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
+extern "C" [[gnu::visibility("protected"), clang::device_kernel]] void
 _start(int argc, char **argv, char **envp, int *ret) {
   // Invoke the 'main' function with every active thread that the user launched
   // the _start kernel with.
   __atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);
 }
 
-extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void _end() {
+extern "C" [[gnu::visibility("protected"), clang::device_kernel]] void _end() {
   // Only a single thread should call the destructors registred with 'atexit'.
   // The loader utility will handle the actual exit and return code cleanly.
   __cxa_finalize(nullptr);

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [libc] [Clang][NFC] Replace device specific kernel attribute with generic one (PR #176250)

Reply via email to