https://github.com/RiverDave updated https://github.com/llvm/llvm-project/pull/177790
>From f703a61ff33f2d8026cf4bece0416afc331e4dbf Mon Sep 17 00:00:00 2001 From: David Rivera <[email protected]> Date: Sat, 24 Jan 2026 14:12:48 -0500 Subject: [PATCH 1/6] [CIR][CUDA] Upstream device stub mangling --- clang/lib/CIR/CodeGen/CIRGenFunction.cpp | 4 +- clang/lib/CIR/CodeGen/CIRGenModule.cpp | 10 ++- .../test/CIR/CodeGen/CUDA/kernel-stub-name.cu | 22 ++++++ clang/test/CIR/CodeGen/inputs/cuda.h | 74 +++++++++++++++++++ 4 files changed, 107 insertions(+), 3 deletions(-) create mode 100644 clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu create mode 100644 clang/test/CIR/CodeGen/inputs/cuda.h diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp index f2d73720a9c2b..4c212b06019ea 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp @@ -748,7 +748,9 @@ cir::FuncOp CIRGenFunction::generateCode(clang::GlobalDecl gd, cir::FuncOp fn, emitConstructorBody(args); } else if (getLangOpts().CUDA && !getLangOpts().CUDAIsDevice && funcDecl->hasAttr<CUDAGlobalAttr>()) { - getCIRGenModule().errorNYI(bodyRange, "CUDA kernel"); + // TODO(cir): Emit device stub body with kernel launch runtime calls + // (emitDeviceStub). For now, emit an empty stub. + assert(!cir::MissingFeatures::cudaSupport()); } else if (isa<CXXMethodDecl>(funcDecl) && cast<CXXMethodDecl>(funcDecl)->isLambdaStaticInvoker()) { // The lambda static invoker function is special, because it forwards or diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp index 61d84f197e6ec..b535eab913a5d 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp @@ -1772,9 +1772,15 @@ static std::string getMangledNameImpl(CIRGenModule &cgm, GlobalDecl gd, cgm.errorNYI(nd->getSourceRange(), "getMangledName: X86RegCall"); } else if (fd && fd->hasAttr<CUDAGlobalAttr>() && gd.getKernelReferenceKind() == KernelReferenceKind::Stub) { - cgm.errorNYI(nd->getSourceRange(), "getMangledName: CUDA device stub"); + out << "__device_stub__" << ii->getName(); + } else if (fd && + DeviceKernelAttr::isOpenCLSpelling( + fd->getAttr<DeviceKernelAttr>()) && + gd.getKernelReferenceKind() == KernelReferenceKind::Stub) { + cgm.errorNYI(nd->getSourceRange(), "getMangledName: OpenCL Stub"); + } else { + out << ii->getName(); } - out << ii->getName(); } // Check if the module name hash should be appended for internal linkage diff --git a/clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu b/clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu new file mode 100644 index 0000000000000..6d5efb69827e3 --- /dev/null +++ b/clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu @@ -0,0 +1,22 @@ +// Based on clang/test/CodeGenCUDA/kernel-stub-name.cu. + +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-cir %s \ +// RUN: -x cuda -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s + +#include "../inputs/cuda.h" + +// CHECK: cir.func {{.*}} @__device_stub__ckernel() +// CHECK-NEXT: cir.return +// CHECK-NEXT: } +extern "C" __global__ void ckernel() {} + +// CHECK: cir.func {{.*}} @_ZN2ns23__device_stub__nskernelEv() +namespace ns { +__global__ void nskernel() {} +} // namespace ns + +// CHECK: cir.func {{.*}} @_Z25__device_stub__kernelfuncIiEvv() +template <class T> +__global__ void kernelfunc() {} +template __global__ void kernelfunc<int>(); diff --git a/clang/test/CIR/CodeGen/inputs/cuda.h b/clang/test/CIR/CodeGen/inputs/cuda.h new file mode 100644 index 0000000000000..204bf2972088d --- /dev/null +++ b/clang/test/CIR/CodeGen/inputs/cuda.h @@ -0,0 +1,74 @@ +/* Minimal declarations for CUDA support. Testing purposes only. */ +/* From test/CodeGenCUDA/Inputs/cuda.h. */ +#include <stddef.h> + +#if __HIP__ || __CUDA__ +#define __constant__ __attribute__((constant)) +#define __device__ __attribute__((device)) +#define __global__ __attribute__((global)) +#define __host__ __attribute__((host)) +#define __shared__ __attribute__((shared)) +#if __HIP__ +#define __managed__ __attribute__((managed)) +#endif +#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__))) +#define __grid_constant__ __attribute__((grid_constant)) +#else +#define __constant__ +#define __device__ +#define __global__ +#define __host__ +#define __shared__ +#define __managed__ +#define __launch_bounds__(...) +#define __grid_constant__ +#endif + +struct dim3 { + unsigned x, y, z; + __host__ __device__ dim3(unsigned x, unsigned y = 1, unsigned z = 1) : x(x), y(y), z(z) {} +}; + +#if __HIP__ || HIP_PLATFORM +typedef struct hipStream *hipStream_t; +typedef enum hipError {} hipError_t; +int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0, + hipStream_t stream = 0); +extern "C" hipError_t __hipPushCallConfiguration(dim3 gridSize, dim3 blockSize, + size_t sharedSize = 0, + hipStream_t stream = 0); +#ifndef __HIP_API_PER_THREAD_DEFAULT_STREAM__ +extern "C" hipError_t hipLaunchKernel(const void *func, dim3 gridDim, + dim3 blockDim, void **args, + size_t sharedMem, + hipStream_t stream); +#else +extern "C" hipError_t hipLaunchKernel_spt(const void *func, dim3 gridDim, + dim3 blockDim, void **args, + size_t sharedMem, + hipStream_t stream); +#endif // __HIP_API_PER_THREAD_DEFAULT_STREAM__ +#elif __OFFLOAD_VIA_LLVM__ +extern "C" unsigned __llvmPushCallConfiguration(dim3 gridDim, dim3 blockDim, + size_t sharedMem = 0, void *stream = 0); +extern "C" unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, + void **args, size_t sharedMem = 0, void *stream = 0); +#else +typedef struct cudaStream *cudaStream_t; +typedef enum cudaError {} cudaError_t; +extern "C" int cudaConfigureCall(dim3 gridSize, dim3 blockSize, + size_t sharedSize = 0, + cudaStream_t stream = 0); +extern "C" int __cudaPushCallConfiguration(dim3 gridSize, dim3 blockSize, + size_t sharedSize = 0, + cudaStream_t stream = 0); +extern "C" cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, + dim3 blockDim, void **args, + size_t sharedMem, cudaStream_t stream); +extern "C" cudaError_t cudaLaunchKernel_ptsz(const void *func, dim3 gridDim, + dim3 blockDim, void **args, + size_t sharedMem, cudaStream_t stream); + +#endif + +extern "C" __device__ int printf(const char*, ...); >From 1892d27374892cb04af492c4cc63a9129056d257 Mon Sep 17 00:00:00 2001 From: David Rivera <[email protected]> Date: Sat, 24 Jan 2026 14:39:09 -0500 Subject: [PATCH 2/6] make test include cleaner --- clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu b/clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu index 6d5efb69827e3..da2dbd9843c7c 100644 --- a/clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu +++ b/clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu @@ -1,10 +1,10 @@ // Based on clang/test/CodeGenCUDA/kernel-stub-name.cu. // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-cir %s \ -// RUN: -x cuda -o %t.cir +// RUN: -I%S/../inputs/ -x cuda -o %t.cir // RUN: FileCheck --input-file=%t.cir %s -#include "../inputs/cuda.h" +#include "cuda.h" // CHECK: cir.func {{.*}} @__device_stub__ckernel() // CHECK-NEXT: cir.return >From 601edf35ccd87c52d5319922238ebb3196265a6c Mon Sep 17 00:00:00 2001 From: David Rivera <[email protected]> Date: Tue, 27 Jan 2026 15:59:09 -0500 Subject: [PATCH 3/6] [CIR][CUDA][HIP] Implement stub body emission --- clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp | 358 ++++++++++++++++++ clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp | 20 + clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h | 50 +++ clang/lib/CIR/CodeGen/CIRGenFunction.cpp | 4 +- clang/lib/CIR/CodeGen/CIRGenModule.cpp | 14 +- clang/lib/CIR/CodeGen/CIRGenModule.h | 9 + clang/lib/CIR/CodeGen/CMakeLists.txt | 2 + clang/test/CIR/CodeGen/CUDA/kernel-call.cu | 18 + .../test/CIR/CodeGen/CUDA/kernel-stub-name.cu | 8 +- clang/test/CIR/CodeGen/inputs/cuda.h | 6 + 10 files changed, 481 insertions(+), 8 deletions(-) create mode 100644 clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp create mode 100644 clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp create mode 100644 clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h create mode 100644 clang/test/CIR/CodeGen/CUDA/kernel-call.cu diff --git a/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp b/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp new file mode 100644 index 0000000000000..acdc811b7a308 --- /dev/null +++ b/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp @@ -0,0 +1,358 @@ +//===- CIRGenCUDANV.cpp - Interface to NVIDIA CUDA Runtime -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This provides an abstract class for CUDA CIR generation. Concrete +// subclasses of this implement code generation for specific OpenCL +// runtime libraries. +// +//===----------------------------------------------------------------------===// + +#include "CIRGenCUDARuntime.h" +#include "CIRGenFunction.h" +#include "CIRGenModule.h" +#include "mlir/IR/Operation.h" +#include "clang/AST/ASTContext.h" +#include "clang/AST/Decl.h" +#include "clang/AST/GlobalDecl.h" +#include "clang/Basic/AddressSpaces.h" +#include "clang/Basic/Cuda.h" +#include "clang/CIR/Dialect/IR/CIRDialect.h" +#include "clang/CIR/Dialect/IR/CIRTypes.h" +#include "llvm/Support/Casting.h" + +using namespace clang; +using namespace clang::CIRGen; + +namespace { + +class CIRGenNVCUDARuntime : public CIRGenCUDARuntime { +protected: + StringRef Prefix; + + // Map a device stub function to a symbol for identifying kernel in host + // code. For CUDA, the symbol for identifying the kernel is the same as the + // device stub function. For HIP, they are different. + llvm::DenseMap<StringRef, mlir::Operation *> kernelHandles; + + // Map a kernel handle to the kernel stub. + llvm::DenseMap<mlir::Operation *, mlir::Operation *> kernelStubs; + // Mangle context for device. + std::unique_ptr<MangleContext> deviceMC; + +private: + void emitDeviceStubBodyNew(CIRGenFunction &cgf, cir::FuncOp fn, + FunctionArgList &args); + mlir::Value prepareKernelArgs(CIRGenFunction &cgf, mlir::Location loc, + FunctionArgList &args); + mlir::Operation *getKernelHandle(cir::FuncOp fn, GlobalDecl gd) override; + std::string addPrefixToName(StringRef funcName) const; + std::string addUnderscoredPrefixToName(StringRef funcName) const; + +public: + CIRGenNVCUDARuntime(CIRGenModule &cgm); + ~CIRGenNVCUDARuntime(); + + void emitDeviceStub(CIRGenFunction &cgf, cir::FuncOp fn, + FunctionArgList &args) override; +}; + +} // namespace + +std::string CIRGenNVCUDARuntime::addPrefixToName(StringRef funcName) const { + return (Prefix + funcName).str(); +} + +std::string +CIRGenNVCUDARuntime::addUnderscoredPrefixToName(StringRef funcName) const { + return ("__" + Prefix + funcName).str(); +} + +static std::unique_ptr<MangleContext> initDeviceMC(CIRGenModule &cgm) { + // If the host and device have different C++ ABIs, mark it as the device + // mangle context so that the mangling needs to retrieve the additional + // device lambda mangling number instead of the regular host one. + if (cgm.getASTContext().getAuxTargetInfo() && + cgm.getASTContext().getTargetInfo().getCXXABI().isMicrosoft() && + cgm.getASTContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) { + return std::unique_ptr<MangleContext>( + cgm.getASTContext().createDeviceMangleContext( + *cgm.getASTContext().getAuxTargetInfo())); + } + + return std::unique_ptr<MangleContext>(cgm.getASTContext().createMangleContext( + cgm.getASTContext().getAuxTargetInfo())); +} + +CIRGenNVCUDARuntime::CIRGenNVCUDARuntime(CIRGenModule &cgm) + : CIRGenCUDARuntime(cgm), deviceMC(initDeviceMC(cgm)) { + if (cgm.getLangOpts().OffloadViaLLVM) + llvm_unreachable("NYI"); + else if (cgm.getLangOpts().HIP) + Prefix = "hip"; + else + Prefix = "cuda"; +} + +mlir::Value CIRGenNVCUDARuntime::prepareKernelArgs(CIRGenFunction &cgf, + mlir::Location loc, + FunctionArgList &args) { + auto &builder = cgm.getBuilder(); + + // Build void *args[] and populate with the addresses of kernel arguments. + auto voidPtrArrayTy = cir::ArrayType::get(cgm.voidPtrTy, args.size()); + mlir::Value kernelArgs = builder.createAlloca( + loc, cir::PointerType::get(voidPtrArrayTy), voidPtrArrayTy, "kernel_args", + CharUnits::fromQuantity(16)); + + mlir::Value kernelArgsDecayed = + builder.createCast(cir::CastKind::array_to_ptrdecay, kernelArgs, + cir::PointerType::get(cgm.voidPtrTy)); + + for (auto [i, arg] : llvm::enumerate(args)) { + mlir::Value index = + builder.getConstInt(loc, llvm::APInt(/*numBits=*/32, i)); + mlir::Value storePos = + builder.createPtrStride(loc, kernelArgsDecayed, index); + + // Get the address of the argument and cast the store destination to match + // its pointer-to-pointer type. This is needed because upstream's + // createStore doesn't auto-bitcast like the incubator version. + mlir::Value argAddr = cgf.getAddrOfLocalVar(arg).getPointer(); + mlir::Value storePosTyped = builder.createBitcast( + storePos, cir::PointerType::get(argAddr.getType())); + + builder.CIRBaseBuilderTy::createStore(loc, argAddr, storePosTyped); + } + + return kernelArgsDecayed; +} + +// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local +// array and kernels are launched using cudaLaunchKernel(). +void CIRGenNVCUDARuntime::emitDeviceStubBodyNew(CIRGenFunction &cgf, + cir::FuncOp fn, + FunctionArgList &args) { + + // This requires arguments to be sent to kernels in a different way. + if (cgm.getLangOpts().OffloadViaLLVM) + cgm.errorNYI("Offload via LLVM"); + + auto &builder = cgm.getBuilder(); + auto loc = fn.getLoc(); + + // For [cuda|hip]LaunchKernel, we must add another layer of indirection + // to arguments. For example, for function `add(int a, float b)`, + // we need to pass it as `void *args[2] = { &a, &b }`. + mlir::Value kernelArgs = prepareKernelArgs(cgf, loc, args); + + // Lookup cudaLaunchKernel/hipLaunchKernel function. + // HIP kernel launching API name depends on -fgpu-default-stream option. For + // the default value 'legacy', it is hipLaunchKernel. For 'per-thread', + // it is hipLaunchKernel_spt. + // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, + // void **args, size_t sharedMem, + // cudaStream_t stream); + // hipError_t hipLaunchKernel[_spt](const void *func, dim3 gridDim, + // dim3 blockDim, void **args, + // size_t sharedMem, hipStream_t stream); + TranslationUnitDecl *tuDecl = cgm.getASTContext().getTranslationUnitDecl(); + DeclContext *dc = TranslationUnitDecl::castToDeclContext(tuDecl); + + // The default stream is usually stream 0 (the legacy default stream). + // For per-thread default stream, we need a different LaunchKernel function. + std::string kernelLaunchAPI = "LaunchKernel"; + if (cgm.getLangOpts().GPUDefaultStream == + LangOptions::GPUDefaultStreamKind::PerThread) + cgm.errorNYI("CUDA/HIP Stream per thread"); + + std::string launchKernelName = addPrefixToName(kernelLaunchAPI); + const IdentifierInfo &launchII = + cgm.getASTContext().Idents.get(launchKernelName); + FunctionDecl *cudaLaunchKernelFD = nullptr; + for (auto *result : dc->lookup(&launchII)) { + if (FunctionDecl *fd = dyn_cast<FunctionDecl>(result)) + cudaLaunchKernelFD = fd; + } + + if (cudaLaunchKernelFD == nullptr) { + cgm.error(cgf.curFuncDecl->getLocation(), + "Can't find declaration for " + launchKernelName); + return; + } + + // Use this function to retrieve arguments for cudaLaunchKernel: + // int __[cuda|hip]PopCallConfiguration(dim3 *gridDim, dim3 *blockDim, size_t + // *sharedMem, cudaStream_t *stream) + // + // Here [cuda|hip]Stream_t, while also being the 6th argument of + // [cuda|hip]LaunchKernel, is a pointer to some opaque struct. + + mlir::Type dim3Ty = cgf.getTypes().convertType( + cudaLaunchKernelFD->getParamDecl(1)->getType()); + mlir::Type streamTy = cgf.getTypes().convertType( + cudaLaunchKernelFD->getParamDecl(5)->getType()); + + mlir::Value gridDim = + builder.createAlloca(loc, cir::PointerType::get(dim3Ty), dim3Ty, + "grid_dim", CharUnits::fromQuantity(8)); + mlir::Value blockDim = + builder.createAlloca(loc, cir::PointerType::get(dim3Ty), dim3Ty, + "block_dim", CharUnits::fromQuantity(8)); + mlir::Value sharedMem = + builder.createAlloca(loc, cir::PointerType::get(cgm.sizeTy), cgm.sizeTy, + "shared_mem", cgm.getSizeAlign()); + mlir::Value stream = + builder.createAlloca(loc, cir::PointerType::get(streamTy), streamTy, + "stream", cgm.getPointerAlign()); + + cir::FuncOp popConfig = cgm.createRuntimeFunction( + cir::FuncType::get({gridDim.getType(), blockDim.getType(), + sharedMem.getType(), stream.getType()}, + cgm.sInt32Ty), + addUnderscoredPrefixToName("PopCallConfiguration")); + cgf.emitRuntimeCall(loc, popConfig, {gridDim, blockDim, sharedMem, stream}); + + // Now emit the call to cudaLaunchKernel + // [cuda|hip]Error_t [cuda|hip]LaunchKernel(const void *func, dim3 gridDim, + // dim3 blockDim, + // void **args, size_t sharedMem, + // [cuda|hip]Stream_t stream); + + // We now either pick the function or the stub global for cuda, hip + // resepectively. + auto kernel = [&]() { + if (auto globalOp = llvm::dyn_cast_or_null<cir::GlobalOp>( + kernelHandles[fn.getSymName()])) { + auto kernelTy = cir::PointerType::get(globalOp.getSymType()); + mlir::Value kernel = cir::GetGlobalOp::create(builder, loc, kernelTy, + globalOp.getSymName()); + return kernel; + } + if (auto funcOp = llvm::dyn_cast_or_null<cir::FuncOp>( + kernelHandles[fn.getSymName()])) { + auto kernelTy = cir::PointerType::get(funcOp.getFunctionType()); + mlir::Value kernel = + cir::GetGlobalOp::create(builder, loc, kernelTy, funcOp.getSymName()); + mlir::Value func = builder.createBitcast(kernel, cgm.voidPtrTy); + return func; + } + assert(false && "Expected stub handle to be cir::GlobalOp or funcOp"); + }(); + + CallArgList launchArgs; + launchArgs.add(RValue::get(kernel), + cudaLaunchKernelFD->getParamDecl(0)->getType()); + launchArgs.add( + RValue::getAggregate(Address(gridDim, CharUnits::fromQuantity(8))), + cudaLaunchKernelFD->getParamDecl(1)->getType()); + launchArgs.add( + RValue::getAggregate(Address(blockDim, CharUnits::fromQuantity(8))), + cudaLaunchKernelFD->getParamDecl(2)->getType()); + launchArgs.add(RValue::get(kernelArgs), + cudaLaunchKernelFD->getParamDecl(3)->getType()); + launchArgs.add( + RValue::get(builder.CIRBaseBuilderTy::createLoad(loc, sharedMem)), + cudaLaunchKernelFD->getParamDecl(4)->getType()); + launchArgs.add(RValue::get(builder.CIRBaseBuilderTy::createLoad(loc, stream)), + cudaLaunchKernelFD->getParamDecl(5)->getType()); + + mlir::Type launchTy = + cgm.getTypes().convertType(cudaLaunchKernelFD->getType()); + mlir::Operation *cudaKernelLauncherFn = cgm.createRuntimeFunction( + cast<cir::FuncType>(launchTy), launchKernelName); + const auto &callInfo = + cgm.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD); + cgf.emitCall(callInfo, CIRGenCallee::forDirect(cudaKernelLauncherFn), + ReturnValueSlot(), launchArgs); + + if (cgm.getASTContext().getTargetInfo().getCXXABI().isMicrosoft() && + !cgf.getLangOpts().HIP) + cgm.errorNYI("MSVC CUDA stub handling"); +} + +void CIRGenNVCUDARuntime::emitDeviceStub(CIRGenFunction &cgf, cir::FuncOp fn, + FunctionArgList &args) { + + if (auto globalOp = + llvm::dyn_cast<cir::GlobalOp>(kernelHandles[fn.getSymName()])) { + auto &builder = cgm.getBuilder(); + auto fnPtrTy = globalOp.getSymType(); + auto sym = mlir::FlatSymbolRefAttr::get(fn.getSymNameAttr()); + auto gv = cir::GlobalViewAttr::get(fnPtrTy, sym); + + globalOp->setAttr("initial_value", gv); + globalOp->removeAttr("sym_visibility"); + globalOp->setAttr("alignment", builder.getI64IntegerAttr( + cgm.getPointerAlign().getQuantity())); + } + + // CUDA 9.0 changed the way to launch kernels. + if (CudaFeatureEnabled(cgm.getTarget().getSDKVersion(), + CudaFeature::CUDA_USES_NEW_LAUNCH) || + (cgm.getLangOpts().HIP && cgm.getLangOpts().HIPUseNewLaunchAPI) || + cgm.getLangOpts().OffloadViaLLVM) + emitDeviceStubBodyNew(cgf, fn, args); + else + cgm.errorNYI("Emit Stub Body Legacy"); +} + +CIRGenCUDARuntime *clang::CIRGen::createNVCUDARuntime(CIRGenModule &cgm) { + return new CIRGenNVCUDARuntime(cgm); +} + +CIRGenNVCUDARuntime::~CIRGenNVCUDARuntime() {} + +mlir::Operation *CIRGenNVCUDARuntime::getKernelHandle(cir::FuncOp fn, + GlobalDecl GD) { + + // Check if we already have a kernel handle for this function + auto Loc = kernelHandles.find(fn.getSymName()); + if (Loc != kernelHandles.end()) { + auto OldHandle = Loc->second; + // Here we know that the fn did not change. Return it + if (kernelStubs[OldHandle] == fn) + return OldHandle; + + // We've found the function name, but F itself has changed, so we need to + // update the references. + if (cgm.getLangOpts().HIP) { + // For HIP compilation the handle itself does not change, so we only need + // to update the Stub value. + kernelStubs[OldHandle] = fn; + return OldHandle; + } + // For non-HIP compilation, erase the old Stub and fall-through to creating + // new entries. + kernelStubs.erase(OldHandle); + } + + // If not targeting HIP, store the function itself + if (!cgm.getLangOpts().HIP) { + kernelHandles[fn.getSymName()] = fn; + kernelStubs[fn] = fn; + return fn; + } + + // Create a new CIR global variable to represent the kernel handle + auto &builder = cgm.getBuilder(); + auto globalName = cgm.getMangledName( + GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel)); + const VarDecl *varDecl = llvm::dyn_cast_or_null<VarDecl>(GD.getDecl()); + cir::GlobalOp globalOp = + cgm.getOrCreateCIRGlobal(globalName, fn.getFunctionType().getReturnType(), + LangAS::Default, varDecl, NotForDefinition); + + globalOp->setAttr("alignment", builder.getI64IntegerAttr( + cgm.getPointerAlign().getQuantity())); + + // Store references + kernelHandles[fn.getSymName()] = globalOp; + kernelStubs[globalOp] = fn; + + return globalOp; +} \ No newline at end of file diff --git a/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp b/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp new file mode 100644 index 0000000000000..c438c968c24ce --- /dev/null +++ b/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp @@ -0,0 +1,20 @@ +//===----- CIRGenCUDARuntime.cpp - Interface to CUDA Runtimes -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This provides an abstract class for CUDA CIR generation. Concrete +// subclasses of this implement code generation for specific CUDA +// runtime libraries. +// +//===----------------------------------------------------------------------===// + +#include "CIRGenCUDARuntime.h" + +using namespace clang; +using namespace CIRGen; + +CIRGenCUDARuntime::~CIRGenCUDARuntime() {} diff --git a/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h b/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h new file mode 100644 index 0000000000000..a0809c1d185b8 --- /dev/null +++ b/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h @@ -0,0 +1,50 @@ +//===------ CIRGenCUDARuntime.h - Interface to CUDA Runtimes -----*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This provides an abstract class for CUDA CIR generation. Concrete +// subclasses of this implement code generation for specific OpenCL +// runtime libraries. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_LIB_CIR_CIRGENCUDARUNTIME_H +#define LLVM_CLANG_LIB_CIR_CIRGENCUDARUNTIME_H + +#include "clang/CIR/Dialect/IR/CIRDialect.h" + +namespace clang { +class CUDAKernelCallExpr; +} + +namespace clang::CIRGen { + +class CIRGenFunction; +class CIRGenModule; +class FunctionArgList; +class RValue; +class ReturnValueSlot; + +class CIRGenCUDARuntime { +protected: + CIRGenModule &cgm; + +public: + CIRGenCUDARuntime(CIRGenModule &cgm) : cgm(cgm) {} + virtual ~CIRGenCUDARuntime(); + + virtual void emitDeviceStub(CIRGenFunction &cgf, cir::FuncOp fn, + FunctionArgList &args) = 0; + + virtual mlir::Operation *getKernelHandle(cir::FuncOp fn, GlobalDecl gd) = 0; +}; + +CIRGenCUDARuntime *createNVCUDARuntime(CIRGenModule &cgm); + +} // namespace clang::CIRGen + +#endif // LLVM_CLANG_LIB_CIR_CIRGENCUDARUNTIME_H diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp index 4c212b06019ea..c900797e54c81 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp @@ -748,9 +748,7 @@ cir::FuncOp CIRGenFunction::generateCode(clang::GlobalDecl gd, cir::FuncOp fn, emitConstructorBody(args); } else if (getLangOpts().CUDA && !getLangOpts().CUDAIsDevice && funcDecl->hasAttr<CUDAGlobalAttr>()) { - // TODO(cir): Emit device stub body with kernel launch runtime calls - // (emitDeviceStub). For now, emit an empty stub. - assert(!cir::MissingFeatures::cudaSupport()); + cgm.getCUDARuntime().emitDeviceStub(*this, fn, args); } else if (isa<CXXMethodDecl>(funcDecl) && cast<CXXMethodDecl>(funcDecl)->isLambdaStaticInvoker()) { // The lambda static invoker function is special, because it forwards or diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp index b535eab913a5d..8cef5408bbfc1 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "CIRGenModule.h" +#include "CIRGenCUDARuntime.h" #include "CIRGenCXXABI.h" #include "CIRGenConstantEmitter.h" #include "CIRGenFunction.h" @@ -31,6 +32,7 @@ #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/Location.h" #include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Operation.h" #include "mlir/IR/Verifier.h" #include <algorithm> @@ -68,7 +70,8 @@ CIRGenModule::CIRGenModule(mlir::MLIRContext &mlirContext, langOpts(astContext.getLangOpts()), codeGenOpts(cgo), theModule{mlir::ModuleOp::create(mlir::UnknownLoc::get(&mlirContext))}, diags(diags), target(astContext.getTargetInfo()), - abi(createCXXABI(*this)), genTypes(*this), vtables(*this) { + abi(createCXXABI(*this)), genTypes(*this), vtables(*this), + cudaRuntime(clang::CIRGen::createNVCUDARuntime((*this))) { // Initialize cached types voidTy = cir::VoidType::get(&getMLIRContext()); @@ -1748,6 +1751,15 @@ cir::FuncOp CIRGenModule::getAddrOfFunction(clang::GlobalDecl gd, cir::FuncOp func = getOrCreateCIRFunction(mangledName, funcType, gd, forVTable, dontDefer, /*isThunk=*/false, isForDefinition); + // Returns kernel handle for HIP kernel stub function. + if (langOpts.CUDA && !langOpts.CUDAIsDevice && + cast<FunctionDecl>(gd.getDecl())->hasAttr<CUDAGlobalAttr>()) { + mlir::Operation *handle = getCUDARuntime().getKernelHandle(func, gd); + + if (isForDefinition) + return func; + return mlir::dyn_cast<cir::FuncOp>(*handle); + } return func; } diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h index 3c4f35bacc4f9..6f301bd83d373 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.h +++ b/clang/lib/CIR/CodeGen/CIRGenModule.h @@ -15,6 +15,7 @@ #include "CIRGenBuilder.h" #include "CIRGenCall.h" +#include "CIRGenCUDARuntime.h" #include "CIRGenTypeCache.h" #include "CIRGenTypes.h" #include "CIRGenVTables.h" @@ -90,6 +91,9 @@ class CIRGenModule : public CIRGenTypeCache { /// Holds information about C++ vtables. CIRGenVTables vtables; + /// Holds the CUDA runtime + std::unique_ptr<CIRGenCUDARuntime> cudaRuntime; + /// Per-function codegen information. Updated everytime emitCIR is called /// for FunctionDecls's. CIRGenFunction *curCGF = nullptr; @@ -593,6 +597,11 @@ class CIRGenModule : public CIRGenTypeCache { /// Function* for "fabsf". cir::FuncOp getBuiltinLibFunction(const FunctionDecl *fd, unsigned builtinID); + CIRGenCUDARuntime &getCUDARuntime() { + assert(cudaRuntime != nullptr); + return *cudaRuntime; + } + mlir::IntegerAttr getSize(CharUnits size) { return builder.getSizeFromCharUnits(size); } diff --git a/clang/lib/CIR/CodeGen/CMakeLists.txt b/clang/lib/CIR/CodeGen/CMakeLists.txt index 8efa587f31aac..ff5e666a72bef 100644 --- a/clang/lib/CIR/CodeGen/CMakeLists.txt +++ b/clang/lib/CIR/CodeGen/CMakeLists.txt @@ -18,6 +18,8 @@ add_clang_library(clangCIR CIRGenClass.cpp CIRGenCleanup.cpp CIRGenCoroutine.cpp + CIRGenCUDANV.cpp + CIRGenCUDARuntime.cpp CIRGenCXX.cpp CIRGenCXXABI.cpp CIRGenDecl.cpp diff --git a/clang/test/CIR/CodeGen/CUDA/kernel-call.cu b/clang/test/CIR/CodeGen/CUDA/kernel-call.cu new file mode 100644 index 0000000000000..d1dae134b0230 --- /dev/null +++ b/clang/test/CIR/CodeGen/CUDA/kernel-call.cu @@ -0,0 +1,18 @@ +// Based on clang/test/CodeGenCUDA/kernel-call.cu. +// Tests device stub body emission for CUDA kernels. + +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -target-sdk-version=9.2 \ +// RUN: -emit-cir %s -I%S/../inputs/ -x cuda -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CUDA-NEW + + +#include "cuda.h" + + +// TODO: Test CUDA legacy (< 9.0) when legacy stub body is implemented +// TODO: Test HIP when HIP stub body support is complete + +// CUDA-NEW-LABEL: cir.func {{.*}} @_Z21__device_stub__kernelv +// CUDA-NEW: cir.call @__cudaPopCallConfiguration +// CUDA-NEW: cir.call @cudaLaunchKernel +__global__ void kernel() {} diff --git a/clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu b/clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu index da2dbd9843c7c..0edf256ccf961 100644 --- a/clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu +++ b/clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu @@ -1,13 +1,13 @@ // Based on clang/test/CodeGenCUDA/kernel-stub-name.cu. -// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-cir %s \ -// RUN: -I%S/../inputs/ -x cuda -o %t.cir +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -target-sdk-version=9.2 \ +// RUN: -emit-cir %s -I%S/../inputs/ -x cuda -o %t.cir // RUN: FileCheck --input-file=%t.cir %s #include "cuda.h" -// CHECK: cir.func {{.*}} @__device_stub__ckernel() -// CHECK-NEXT: cir.return +// CHECK: cir.func {{.*}} @[[CSTUB:__device_stub__ckernel]]() +// CHECK: cir.return // CHECK-NEXT: } extern "C" __global__ void ckernel() {} diff --git a/clang/test/CIR/CodeGen/inputs/cuda.h b/clang/test/CIR/CodeGen/inputs/cuda.h index 204bf2972088d..225c7dfdcf0db 100644 --- a/clang/test/CIR/CodeGen/inputs/cuda.h +++ b/clang/test/CIR/CodeGen/inputs/cuda.h @@ -37,6 +37,9 @@ int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0, extern "C" hipError_t __hipPushCallConfiguration(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0, hipStream_t stream = 0); +extern "C" int __hipPopCallConfiguration(dim3 *gridSize, dim3 *blockSize, + size_t *sharedSize, + hipStream_t *stream); #ifndef __HIP_API_PER_THREAD_DEFAULT_STREAM__ extern "C" hipError_t hipLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, @@ -62,6 +65,9 @@ extern "C" int cudaConfigureCall(dim3 gridSize, dim3 blockSize, extern "C" int __cudaPushCallConfiguration(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0, cudaStream_t stream = 0); +extern "C" int __cudaPopCallConfiguration(dim3 *gridSize, dim3 *blockSize, + size_t *sharedSize, + cudaStream_t *stream); extern "C" cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream); >From 4509e1ce33324ab380eca5c27e8af8385f6e1a38 Mon Sep 17 00:00:00 2001 From: David Rivera <[email protected]> Date: Tue, 27 Jan 2026 16:01:55 -0500 Subject: [PATCH 4/6] fix fmt --- clang/lib/CIR/CodeGen/CIRGenModule.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h index 6f301bd83d373..9b12a5fe26e04 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.h +++ b/clang/lib/CIR/CodeGen/CIRGenModule.h @@ -14,8 +14,8 @@ #define LLVM_CLANG_LIB_CIR_CODEGEN_CIRGENMODULE_H #include "CIRGenBuilder.h" -#include "CIRGenCall.h" #include "CIRGenCUDARuntime.h" +#include "CIRGenCall.h" #include "CIRGenTypeCache.h" #include "CIRGenTypes.h" #include "CIRGenVTables.h" >From b260ed6dcade871d9e7bbaf92fda49eff3e61fbb Mon Sep 17 00:00:00 2001 From: David Rivera <[email protected]> Date: Tue, 27 Jan 2026 16:05:09 -0500 Subject: [PATCH 5/6] nit: parity with og on runtime headers --- clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp | 5 ++--- clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp | 2 +- clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h | 4 ++-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp b/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp index acdc811b7a308..9a6eaafcbd439 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp @@ -6,9 +6,8 @@ // //===----------------------------------------------------------------------===// // -// This provides an abstract class for CUDA CIR generation. Concrete -// subclasses of this implement code generation for specific OpenCL -// runtime libraries. +// This provides a class for CUDA code generation targeting the NVIDIA CUDA +// runtime library. // //===----------------------------------------------------------------------===// diff --git a/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp b/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp index c438c968c24ce..14189ad7a52f3 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// This provides an abstract class for CUDA CIR generation. Concrete +// This provides an abstract class for CUDA code generation. Concrete // subclasses of this implement code generation for specific CUDA // runtime libraries. // diff --git a/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h b/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h index a0809c1d185b8..83eb0c02188ba 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h +++ b/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// // -// This provides an abstract class for CUDA CIR generation. Concrete -// subclasses of this implement code generation for specific OpenCL +// This provides an abstract class for CUDA code generation. Concrete +// subclasses of this implement code generation for specific CUDA // runtime libraries. // //===----------------------------------------------------------------------===// >From 8e3701e1e3605ff48f366b82b91ffb2dc10a6e32 Mon Sep 17 00:00:00 2001 From: David Rivera <[email protected]> Date: Fri, 30 Jan 2026 13:04:17 -0500 Subject: [PATCH 6/6] address comments and adapt a bunch of lines to proper coding standards --- clang/include/clang/AST/ASTContext.h | 2 + clang/lib/AST/ASTContext.cpp | 12 +++ clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp | 116 +++++++++------------ clang/lib/CIR/CodeGen/CIRGenModule.cpp | 11 +- clang/lib/CIR/CodeGen/CIRGenModule.h | 2 + clang/test/CIR/CodeGen/CUDA/kernel-call.cu | 40 ++++++- 6 files changed, 111 insertions(+), 72 deletions(-) diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h index 68205dd1c1fd9..c8d6de1689512 100644 --- a/clang/include/clang/AST/ASTContext.h +++ b/clang/include/clang/AST/ASTContext.h @@ -2883,6 +2883,8 @@ class ASTContext : public RefCountedBase<ASTContext> { /// (from the AuxTargetInfo) is a an itanium target. MangleContext *createDeviceMangleContext(const TargetInfo &T); + MangleContext *cudaNVInitDeviceMC(); + void DeepCollectObjCIvars(const ObjCInterfaceDecl *OI, bool leafClass, SmallVectorImpl<const ObjCIvarDecl*> &Ivars) const; diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index f52470a4d7458..3f63420cae91e 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -13224,6 +13224,18 @@ MangleContext *ASTContext::createDeviceMangleContext(const TargetInfo &T) { llvm_unreachable("Unsupported ABI"); } +MangleContext *ASTContext::cudaNVInitDeviceMC() { + // If the host and device have different C++ ABIs, mark it as the device + // mangle context so that the mangling needs to retrieve the additional + // device lambda mangling number instead of the regular host one. + if (getAuxTargetInfo() && getTargetInfo().getCXXABI().isMicrosoft() && + getAuxTargetInfo()->getCXXABI().isItaniumFamily()) { + return createDeviceMangleContext(*getAuxTargetInfo()); + } + + return createMangleContext(getAuxTargetInfo()); +} + CXXABI::~CXXABI() = default; size_t ASTContext::getSideTableAllocatedMemory() const { diff --git a/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp b/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp index 9a6eaafcbd439..434c8003af27c 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp @@ -1,4 +1,4 @@ -//===- CIRGenCUDANV.cpp - Interface to NVIDIA CUDA Runtime -----===// +//========- CIRGenCUDANV.cpp - Interface to NVIDIA CUDA Runtime -----=========// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -31,12 +31,12 @@ namespace { class CIRGenNVCUDARuntime : public CIRGenCUDARuntime { protected: - StringRef Prefix; + StringRef prefix; // Map a device stub function to a symbol for identifying kernel in host // code. For CUDA, the symbol for identifying the kernel is the same as the // device stub function. For HIP, they are different. - llvm::DenseMap<StringRef, mlir::Operation *> kernelHandles; + llvm::StringMap<mlir::Operation *> kernelHandles; // Map a kernel handle to the kernel stub. llvm::DenseMap<mlir::Operation *, mlir::Operation *> kernelStubs; @@ -63,44 +63,29 @@ class CIRGenNVCUDARuntime : public CIRGenCUDARuntime { } // namespace std::string CIRGenNVCUDARuntime::addPrefixToName(StringRef funcName) const { - return (Prefix + funcName).str(); + return (prefix + funcName).str(); } std::string CIRGenNVCUDARuntime::addUnderscoredPrefixToName(StringRef funcName) const { - return ("__" + Prefix + funcName).str(); -} - -static std::unique_ptr<MangleContext> initDeviceMC(CIRGenModule &cgm) { - // If the host and device have different C++ ABIs, mark it as the device - // mangle context so that the mangling needs to retrieve the additional - // device lambda mangling number instead of the regular host one. - if (cgm.getASTContext().getAuxTargetInfo() && - cgm.getASTContext().getTargetInfo().getCXXABI().isMicrosoft() && - cgm.getASTContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) { - return std::unique_ptr<MangleContext>( - cgm.getASTContext().createDeviceMangleContext( - *cgm.getASTContext().getAuxTargetInfo())); - } - - return std::unique_ptr<MangleContext>(cgm.getASTContext().createMangleContext( - cgm.getASTContext().getAuxTargetInfo())); + return ("__" + prefix + funcName).str(); } CIRGenNVCUDARuntime::CIRGenNVCUDARuntime(CIRGenModule &cgm) - : CIRGenCUDARuntime(cgm), deviceMC(initDeviceMC(cgm)) { + : CIRGenCUDARuntime(cgm), + deviceMC(cgm.getASTContext().cudaNVInitDeviceMC()) { if (cgm.getLangOpts().OffloadViaLLVM) llvm_unreachable("NYI"); else if (cgm.getLangOpts().HIP) - Prefix = "hip"; + prefix = "hip"; else - Prefix = "cuda"; + prefix = "cuda"; } mlir::Value CIRGenNVCUDARuntime::prepareKernelArgs(CIRGenFunction &cgf, mlir::Location loc, FunctionArgList &args) { - auto &builder = cgm.getBuilder(); + CIRGenBuilderTy &builder = cgm.getBuilder(); // Build void *args[] and populate with the addresses of kernel arguments. auto voidPtrArrayTy = cir::ArrayType::get(cgm.voidPtrTy, args.size()); @@ -112,20 +97,15 @@ mlir::Value CIRGenNVCUDARuntime::prepareKernelArgs(CIRGenFunction &cgf, builder.createCast(cir::CastKind::array_to_ptrdecay, kernelArgs, cir::PointerType::get(cgm.voidPtrTy)); - for (auto [i, arg] : llvm::enumerate(args)) { + for (const auto &[i, arg] : llvm::enumerate(args)) { mlir::Value index = builder.getConstInt(loc, llvm::APInt(/*numBits=*/32, i)); mlir::Value storePos = builder.createPtrStride(loc, kernelArgsDecayed, index); - - // Get the address of the argument and cast the store destination to match - // its pointer-to-pointer type. This is needed because upstream's - // createStore doesn't auto-bitcast like the incubator version. mlir::Value argAddr = cgf.getAddrOfLocalVar(arg).getPointer(); - mlir::Value storePosTyped = builder.createBitcast( - storePos, cir::PointerType::get(argAddr.getType())); + mlir::Value argAsVoid = builder.createBitcast(argAddr, cgm.voidPtrTy); - builder.CIRBaseBuilderTy::createStore(loc, argAddr, storePosTyped); + builder.CIRBaseBuilderTy::createStore(loc, argAsVoid, storePos); } return kernelArgsDecayed; @@ -139,10 +119,13 @@ void CIRGenNVCUDARuntime::emitDeviceStubBodyNew(CIRGenFunction &cgf, // This requires arguments to be sent to kernels in a different way. if (cgm.getLangOpts().OffloadViaLLVM) - cgm.errorNYI("Offload via LLVM"); + cgm.errorNYI("CIRGenNVCUDARuntime: Offload via LLVM"); + + if (cgm.getLangOpts().HIP) + cgm.errorNYI("CIRGenNVCUDARuntime: HIP Support"); - auto &builder = cgm.getBuilder(); - auto loc = fn.getLoc(); + CIRGenBuilderTy &builder = cgm.getBuilder(); + mlir::Location loc = fn.getLoc(); // For [cuda|hip]LaunchKernel, we must add another layer of indirection // to arguments. For example, for function `add(int a, float b)`, @@ -164,7 +147,7 @@ void CIRGenNVCUDARuntime::emitDeviceStubBodyNew(CIRGenFunction &cgf, // The default stream is usually stream 0 (the legacy default stream). // For per-thread default stream, we need a different LaunchKernel function. - std::string kernelLaunchAPI = "LaunchKernel"; + StringRef kernelLaunchAPI = "LaunchKernel"; if (cgm.getLangOpts().GPUDefaultStream == LangOptions::GPUDefaultStreamKind::PerThread) cgm.errorNYI("CUDA/HIP Stream per thread"); @@ -173,7 +156,7 @@ void CIRGenNVCUDARuntime::emitDeviceStubBodyNew(CIRGenFunction &cgf, const IdentifierInfo &launchII = cgm.getASTContext().Idents.get(launchKernelName); FunctionDecl *cudaLaunchKernelFD = nullptr; - for (auto *result : dc->lookup(&launchII)) { + for (NamedDecl *result : dc->lookup(&launchII)) { if (FunctionDecl *fd = dyn_cast<FunctionDecl>(result)) cudaLaunchKernelFD = fd; } @@ -223,24 +206,25 @@ void CIRGenNVCUDARuntime::emitDeviceStubBodyNew(CIRGenFunction &cgf, // [cuda|hip]Stream_t stream); // We now either pick the function or the stub global for cuda, hip - // resepectively. - auto kernel = [&]() { - if (auto globalOp = llvm::dyn_cast_or_null<cir::GlobalOp>( + // respectively. + mlir::Value kernel = [&]() -> mlir::Value { + if (cir::GlobalOp globalOp = llvm::dyn_cast_or_null<cir::GlobalOp>( kernelHandles[fn.getSymName()])) { - auto kernelTy = cir::PointerType::get(globalOp.getSymType()); - mlir::Value kernel = cir::GetGlobalOp::create(builder, loc, kernelTy, - globalOp.getSymName()); - return kernel; + cir::PointerType kernelTy = cir::PointerType::get(globalOp.getSymType()); + mlir::Value kernelVal = cir::GetGlobalOp::create(builder, loc, kernelTy, + globalOp.getSymName()); + return kernelVal; } - if (auto funcOp = llvm::dyn_cast_or_null<cir::FuncOp>( + if (cir::FuncOp funcOp = llvm::dyn_cast_or_null<cir::FuncOp>( kernelHandles[fn.getSymName()])) { - auto kernelTy = cir::PointerType::get(funcOp.getFunctionType()); - mlir::Value kernel = + cir::PointerType kernelTy = + cir::PointerType::get(funcOp.getFunctionType()); + mlir::Value kernelVal = cir::GetGlobalOp::create(builder, loc, kernelTy, funcOp.getSymName()); - mlir::Value func = builder.createBitcast(kernel, cgm.voidPtrTy); + mlir::Value func = builder.createBitcast(kernelVal, cgm.voidPtrTy); return func; } - assert(false && "Expected stub handle to be cir::GlobalOp or funcOp"); + llvm_unreachable("Expected stub handle to be cir::GlobalOp or FuncOp"); }(); CallArgList launchArgs; @@ -264,7 +248,7 @@ void CIRGenNVCUDARuntime::emitDeviceStubBodyNew(CIRGenFunction &cgf, cgm.getTypes().convertType(cudaLaunchKernelFD->getType()); mlir::Operation *cudaKernelLauncherFn = cgm.createRuntimeFunction( cast<cir::FuncType>(launchTy), launchKernelName); - const auto &callInfo = + const CIRGenFunctionInfo &callInfo = cgm.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD); cgf.emitCall(callInfo, CIRGenCallee::forDirect(cudaKernelLauncherFn), ReturnValueSlot(), launchArgs); @@ -279,8 +263,8 @@ void CIRGenNVCUDARuntime::emitDeviceStub(CIRGenFunction &cgf, cir::FuncOp fn, if (auto globalOp = llvm::dyn_cast<cir::GlobalOp>(kernelHandles[fn.getSymName()])) { - auto &builder = cgm.getBuilder(); - auto fnPtrTy = globalOp.getSymType(); + CIRGenBuilderTy &builder = cgm.getBuilder(); + mlir::Type fnPtrTy = globalOp.getSymType(); auto sym = mlir::FlatSymbolRefAttr::get(fn.getSymNameAttr()); auto gv = cir::GlobalViewAttr::get(fnPtrTy, sym); @@ -307,27 +291,27 @@ CIRGenCUDARuntime *clang::CIRGen::createNVCUDARuntime(CIRGenModule &cgm) { CIRGenNVCUDARuntime::~CIRGenNVCUDARuntime() {} mlir::Operation *CIRGenNVCUDARuntime::getKernelHandle(cir::FuncOp fn, - GlobalDecl GD) { + GlobalDecl gd) { // Check if we already have a kernel handle for this function - auto Loc = kernelHandles.find(fn.getSymName()); - if (Loc != kernelHandles.end()) { - auto OldHandle = Loc->second; + auto it = kernelHandles.find(fn.getSymName()); + if (it != kernelHandles.end()) { + mlir::Operation *oldHandle = it->second; // Here we know that the fn did not change. Return it - if (kernelStubs[OldHandle] == fn) - return OldHandle; + if (kernelStubs[oldHandle] == fn) + return oldHandle; // We've found the function name, but F itself has changed, so we need to // update the references. if (cgm.getLangOpts().HIP) { // For HIP compilation the handle itself does not change, so we only need // to update the Stub value. - kernelStubs[OldHandle] = fn; - return OldHandle; + kernelStubs[oldHandle] = fn; + return oldHandle; } // For non-HIP compilation, erase the old Stub and fall-through to creating // new entries. - kernelStubs.erase(OldHandle); + kernelStubs.erase(oldHandle); } // If not targeting HIP, store the function itself @@ -338,10 +322,10 @@ mlir::Operation *CIRGenNVCUDARuntime::getKernelHandle(cir::FuncOp fn, } // Create a new CIR global variable to represent the kernel handle - auto &builder = cgm.getBuilder(); - auto globalName = cgm.getMangledName( - GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel)); - const VarDecl *varDecl = llvm::dyn_cast_or_null<VarDecl>(GD.getDecl()); + CIRGenBuilderTy &builder = cgm.getBuilder(); + StringRef globalName = cgm.getMangledName( + gd.getWithKernelReferenceKind(KernelReferenceKind::Kernel)); + const VarDecl *varDecl = llvm::dyn_cast_or_null<VarDecl>(gd.getDecl()); cir::GlobalOp globalOp = cgm.getOrCreateCIRGlobal(globalName, fn.getFunctionType().getReturnType(), LangAS::Default, varDecl, NotForDefinition); diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp index 8cef5408bbfc1..6ce66922deb0f 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp @@ -70,8 +70,7 @@ CIRGenModule::CIRGenModule(mlir::MLIRContext &mlirContext, langOpts(astContext.getLangOpts()), codeGenOpts(cgo), theModule{mlir::ModuleOp::create(mlir::UnknownLoc::get(&mlirContext))}, diags(diags), target(astContext.getTargetInfo()), - abi(createCXXABI(*this)), genTypes(*this), vtables(*this), - cudaRuntime(clang::CIRGen::createNVCUDARuntime((*this))) { + abi(createCXXABI(*this)), genTypes(*this), vtables(*this) { // Initialize cached types voidTy = cir::VoidType::get(&getMLIRContext()); @@ -129,6 +128,10 @@ CIRGenModule::CIRGenModule(mlir::MLIRContext &mlirContext, cir::OptInfoAttr::get(&mlirContext, cgo.OptimizationLevel, cgo.OptimizeSize)); + + if (langOpts.CUDA) + createCUDARuntime(); + // Set the module name to be the name of the main file. TranslationUnitDecl // often contains invalid source locations and isn't a reliable source for the // module location. @@ -146,6 +149,10 @@ CIRGenModule::CIRGenModule(mlir::MLIRContext &mlirContext, CIRGenModule::~CIRGenModule() = default; +void CIRGenModule::createCUDARuntime() { + cudaRuntime.reset(createNVCUDARuntime(*this)); +} + /// FIXME: this could likely be a common helper and not necessarily related /// with codegen. /// Return the best known alignment for an unknown pointer to a diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h index 9b12a5fe26e04..1c2d2f8277fa8 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.h +++ b/clang/lib/CIR/CodeGen/CIRGenModule.h @@ -100,6 +100,8 @@ class CIRGenModule : public CIRGenTypeCache { llvm::SmallVector<mlir::Attribute> globalScopeAsm; + void createCUDARuntime(); + public: mlir::ModuleOp getModule() const { return theModule; } CIRGenBuilderTy &getBuilder() { return builder; } diff --git a/clang/test/CIR/CodeGen/CUDA/kernel-call.cu b/clang/test/CIR/CodeGen/CUDA/kernel-call.cu index d1dae134b0230..2fca96fe3926c 100644 --- a/clang/test/CIR/CodeGen/CUDA/kernel-call.cu +++ b/clang/test/CIR/CodeGen/CUDA/kernel-call.cu @@ -12,7 +12,39 @@ // TODO: Test CUDA legacy (< 9.0) when legacy stub body is implemented // TODO: Test HIP when HIP stub body support is complete -// CUDA-NEW-LABEL: cir.func {{.*}} @_Z21__device_stub__kernelv -// CUDA-NEW: cir.call @__cudaPopCallConfiguration -// CUDA-NEW: cir.call @cudaLaunchKernel -__global__ void kernel() {} +// Check that the stub function is generated with the correct name +// CUDA-NEW-LABEL: cir.func {{.*}} @_Z21__device_stub__kernelif +// +// Check kernel arguments are allocated as local variables +// CUDA-NEW-DAG: cir.alloca !s32i, {{.*}} ["x", init] +// CUDA-NEW-DAG: cir.alloca !cir.float, {{.*}} ["y", init] +// +// Check void *args[] array is created with correct size (2 args) +// CUDA-NEW: cir.alloca !cir.array<!cir.ptr<!void> x 2>, {{.*}} ["kernel_args"] +// CUDA-NEW: cir.cast array_to_ptrdecay +// +// Check arguments are stored in the args array via ptr_stride indexing +// CUDA-NEW: cir.const #cir.int<0> +// CUDA-NEW: cir.ptr_stride +// CUDA-NEW: cir.cast bitcast {{.*}} -> !cir.ptr<!void> +// CUDA-NEW: cir.store {{.*}} !cir.ptr<!void>, !cir.ptr<!cir.ptr<!void>> +// CUDA-NEW: cir.const #cir.int<1> +// CUDA-NEW: cir.ptr_stride +// CUDA-NEW: cir.cast bitcast {{.*}} -> !cir.ptr<!void> +// CUDA-NEW: cir.store {{.*}} !cir.ptr<!void>, !cir.ptr<!cir.ptr<!void>> +// +// Check dim3 grid_dim and block_dim allocas for launch configuration +// CUDA-NEW-DAG: cir.alloca !rec_dim3, {{.*}} ["grid_dim"] +// CUDA-NEW-DAG: cir.alloca !rec_dim3, {{.*}} ["block_dim"] +// +// Check shared_mem (size_t) and stream allocas +// CUDA-NEW-DAG: cir.alloca !u64i, {{.*}} ["shared_mem"] +// CUDA-NEW-DAG: cir.alloca !cir.ptr<!rec_cudaStream>, {{.*}} ["stream"] +// +// Check __cudaPopCallConfiguration is called with correct argument types +// CUDA-NEW: cir.call @__cudaPopCallConfiguration({{.*}}) : (!cir.ptr<!rec_dim3>, !cir.ptr<!rec_dim3>, !cir.ptr<!u64i>, !cir.ptr<!cir.ptr<!rec_cudaStream>>) -> !s32i +// +// Check cudaLaunchKernel is called with all 6 arguments: +// func ptr, gridDim, blockDim, args, sharedMem, stream +// CUDA-NEW: cir.call @cudaLaunchKernel({{.*}}) : (!cir.ptr<!void>, !rec_dim3, !rec_dim3, !cir.ptr<!cir.ptr<!void>>, !u64i, !cir.ptr<!rec_cudaStream>) -> !u32i +__global__ void kernel(int x, float y) {} _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
