https://github.com/RiverDave created https://github.com/llvm/llvm-project/pull/199270
None >From d7dc3fc910f263519afd2a76c5981c09a560cdb9 Mon Sep 17 00:00:00 2001 From: David Rivera <[email protected]> Date: Fri, 22 May 2026 15:51:20 -0400 Subject: [PATCH] [CIR][CUDA] Emit global var registration --- clang/include/clang/CIR/MissingFeatures.h | 1 - .../Dialect/Transforms/LoweringPrepare.cpp | 99 +++++++++++++++++-- clang/test/CIR/CodeGenCUDA/device-stub.cu | 49 +++++++-- 3 files changed, 136 insertions(+), 13 deletions(-) diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h index 8af09c5007495..ddcfbf5080e6f 100644 --- a/clang/include/clang/CIR/MissingFeatures.h +++ b/clang/include/clang/CIR/MissingFeatures.h @@ -219,7 +219,6 @@ struct MissingFeatures { static bool ctorMemcpyizer() { return false; } static bool cudaSupport() { return false; } static bool hipModuleCtor() { return false; } - static bool globalRegistration() { return false; } static bool dataLayoutTypeAllocSize() { return false; } static bool dataLayoutPtrHandlingBasedOnLangAS() { return false; } static bool deferredCXXGlobalInit() { return false; } diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp index b910ca3c8286c..27900987fac68 100644 --- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp +++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp @@ -165,6 +165,8 @@ struct LoweringPreparePass /// ------------ llvm::StringMap<FuncOp> cudaKernelMap; + llvm::SmallVector<std::pair<cir::GlobalOp, cir::CUDAVarRegistrationInfoAttr>> + cudaDeviceVars; /// Build the CUDA module constructor that registers the fat binary /// with the CUDA runtime. @@ -172,6 +174,8 @@ struct LoweringPreparePass std::optional<FuncOp> buildCUDAModuleDtor(); std::optional<FuncOp> buildHIPModuleDtor(); std::optional<FuncOp> buildCUDARegisterGlobals(); + void buildCUDARegisterVars(cir::CIRBaseBuilderTy &builder, + FuncOp regGlobalFunc); void buildCUDARegisterGlobalFunctions(cir::CIRBaseBuilderTy &builder, FuncOp regGlobalFunc); @@ -2239,6 +2243,9 @@ void LoweringPreparePass::runOnOp(mlir::Operation *op) { lowerComplexMulOp(complexMul); } else if (auto glob = mlir::dyn_cast<cir::GlobalOp>(op)) { lowerGlobalOp(glob); + if (auto regAttr = glob->getAttrOfType<CUDAVarRegistrationInfoAttr>( + CUDAVarRegistrationInfoAttr::getMnemonic())) + cudaDeviceVars.emplace_back(glob, regAttr); } else if (auto getGlob = mlir::dyn_cast<cir::GetGlobalOp>(op)) { lowerGetGlobalOp(getGlob); } else if (auto unaryOp = mlir::dyn_cast<cir::UnaryOpInterface>(op)) { @@ -2303,8 +2310,8 @@ void LoweringPreparePass::buildCUDAModuleCtor() { llvm_unreachable("GPU RDC NYI"); // For CUDA without -fgpu-rdc, it's safe to stop generating ctor - // if there's nothing to register. - if (cudaKernelMap.empty()) + // if there's nothing to register. + if (cudaKernelMap.empty() && cudaDeviceVars.empty()) return; // There's no device-side binary, so no need to proceed for CUDA. @@ -2658,8 +2665,7 @@ std::optional<FuncOp> LoweringPreparePass::buildHIPModuleDtor() { } std::optional<FuncOp> LoweringPreparePass::buildCUDARegisterGlobals() { - // There is nothing to register. - if (cudaKernelMap.empty()) + if (cudaKernelMap.empty() && cudaDeviceVars.empty()) return {}; cir::CIRBaseBuilderTy builder(getContext()); @@ -2683,8 +2689,7 @@ std::optional<FuncOp> LoweringPreparePass::buildCUDARegisterGlobals() { builder.setInsertionPointToStart(regGlobalFunc.addEntryBlock()); buildCUDARegisterGlobalFunctions(builder, regGlobalFunc); - // TODO: Handle shadow registration - assert(!cir::MissingFeatures::globalRegistration()); + buildCUDARegisterVars(builder, regGlobalFunc); ReturnOp::create(builder, loc); return regGlobalFunc; @@ -2774,6 +2779,88 @@ void LoweringPreparePass::buildCUDARegisterGlobalFunctions( } } +// Emit `__{cuda|hip}RegisterVar` calls inside `__{cuda|hip}_register_globals` +// for every device-side shadow that carries a `cu.var_registration` attribute +// (attached by `CIRGenNVCUDARuntime::handleVarRegistration`). +void LoweringPreparePass::buildCUDARegisterVars(cir::CIRBaseBuilderTy &builder, + FuncOp regGlobalFunc) { + mlir::Location loc = mlirModule.getLoc(); + llvm::StringRef cudaPrefix = getCUDAPrefix(astCtx); + cir::CIRDataLayout dataLayout(mlirModule); + + PointerType voidPtrTy = builder.getVoidPtrTy(); + PointerType voidPtrPtrTy = builder.getPointerTo(voidPtrTy); + IntType intTy = builder.getSIntNTy(32); + IntType sizeTy = + builder.getUIntNTy(astCtx->getTargetInfo().getMaxPointerWidth()); + IntType charTy = cir::IntType::get(&getContext(), astCtx->getCharWidth(), + /*isSigned=*/false); + + if (cudaDeviceVars.empty()) + return; + + cir::CIRBaseBuilderTy globalBuilder(getContext()); + globalBuilder.setInsertionPointToStart(mlirModule.getBody()); + + // void __{cuda|hip}RegisterVar(void **fatbinHandle, + // char *hostVar, char *deviceAddress, + // const char *deviceName, int ext, + // size_t size, int constant, int normalized); + // OG ignores parameter types, treating pointers as void*. + cir::VoidType voidTy = builder.getVoidTy(); + FuncOp cudaRegisterVar = buildRuntimeFunction( + globalBuilder, addUnderscoredPrefix(cudaPrefix, "RegisterVar"), loc, + FuncType::get({voidPtrPtrTy, voidPtrTy, voidPtrTy, voidPtrTy, intTy, + sizeTy, intTy, intTy}, + voidTy)); + + auto makeConstantString = [&](llvm::StringRef str) -> GlobalOp { + auto strType = ArrayType::get(&getContext(), charTy, 1 + str.size()); + auto tmpString = cir::GlobalOp::create( + globalBuilder, loc, (".str" + str).str(), strType, + /*isConstant=*/true, {}, + /*linkage=*/cir::GlobalLinkageKind::PrivateLinkage); + tmpString.setInitialValueAttr( + ConstArrayAttr::get(strType, StringAttr::get(str + "\0", strType))); + tmpString.setPrivate(); + return tmpString; + }; + + mlir::Value fatbinHandle = *regGlobalFunc.args_begin(); + + for (auto &[global, regAttr] : cudaDeviceVars) { + switch (regAttr.getKind()) { + case cir::CUDADeviceVarKind::Variable: + break; + case cir::CUDADeviceVarKind::Surface: + llvm_unreachable("Surface registration NYI"); + case cir::CUDADeviceVarKind::Texture: + llvm_unreachable("Texture registration NYI"); + } + + if (regAttr.getIsManaged()) + llvm_unreachable("Managed variable registration NYI"); + + GlobalOp deviceNameStr = makeConstantString(regAttr.getDeviceSideName()); + mlir::Value deviceName = builder.createBitcast( + builder.createGetGlobal(deviceNameStr), voidPtrTy); + mlir::Value hostVar = + builder.createBitcast(builder.createGetGlobal(global), voidPtrTy); + + auto isExtern = ConstantOp::create( + builder, loc, IntAttr::get(intTy, regAttr.getIsExtern() ? 1 : 0)); + llvm::TypeSize size = dataLayout.getTypeSizeInBits(global.getSymType()); + auto varSize = ConstantOp::create( + builder, loc, IntAttr::get(sizeTy, size.getFixedValue() / 8)); + auto isConstant = ConstantOp::create( + builder, loc, IntAttr::get(intTy, regAttr.getIsConstant() ? 1 : 0)); + auto normalized = ConstantOp::create(builder, loc, IntAttr::get(intTy, 0)); + builder.createCallOp(loc, cudaRegisterVar, + {fatbinHandle, hostVar, deviceName, deviceName, + isExtern, varSize, isConstant, normalized}); + } +} + void LoweringPreparePass::runOnOperation() { mlir::Operation *op = getOperation(); if (isa<::mlir::ModuleOp>(op)) diff --git a/clang/test/CIR/CodeGenCUDA/device-stub.cu b/clang/test/CIR/CodeGenCUDA/device-stub.cu index b635f5931df77..6517143f72aad 100644 --- a/clang/test/CIR/CodeGenCUDA/device-stub.cu +++ b/clang/test/CIR/CodeGenCUDA/device-stub.cu @@ -41,6 +41,11 @@ __global__ void kernelfunc(int i, int j, int k) {} void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); } +// Device-side shadows: exercise the __cudaRegisterVar code path alongside the +// existing __cudaRegisterFunction kernel registration. +__device__ int a; +__constant__ int b; + // Check module constructor is registered in module attributes. // CIR: cir.global_ctors = [#cir.global_ctor<"__cuda_module_ctor", 65535>] @@ -57,6 +62,13 @@ void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); } // CIR: cir.func private @__cudaRegisterFatBinaryEnd(!cir.ptr<!cir.ptr<!void>>) +// __cudaRegisterVar runtime declaration and per-variable name strings for +// device shadows. These are emitted between __cudaRegisterFatBinaryEnd and +// __cudaRegisterFunction; relative order is not significant. +// CIR-DAG: cir.func private @__cudaRegisterVar(!cir.ptr<!cir.ptr<!void>>, !cir.ptr<!void>, !cir.ptr<!void>, !cir.ptr<!void>, !s32i, !u64i, !s32i, !s32i) +// CIR-DAG: cir.global "private" constant cir_private @".stra" = #cir.const_array<"a" : !cir.array<!u8i x 2>, trailing_zeros> +// CIR-DAG: cir.global "private" constant cir_private @".strb" = #cir.const_array<"b" : !cir.array<!u8i x 2>, trailing_zeros> + // Check the __cudaRegisterFunction runtime declaration: // int __cudaRegisterFunction(void**, void*, void*, void*, int, // void*, void*, void*, void*, void*) @@ -65,8 +77,9 @@ void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); } // Check the device-side name string for kernelfunc (mangled, null-terminated). // CIR: cir.global "private" constant cir_private @".str_Z10kernelfunciii" = #cir.const_array<"_Z10kernelfunciii" : !cir.array<!u8i x 18>, trailing_zeros> : !cir.array<!u8i x 18> -// Check __cuda_register_globals body: one __cudaRegisterFunction call per kernel. -// CIR: cir.func internal private @__cuda_register_globals(%arg0: !cir.ptr<!cir.ptr<!void>> +// Check __cuda_register_globals body: __cudaRegisterFunction for each kernel, +// then __cudaRegisterVar for each device shadow. +// CIR: cir.func internal private @__cuda_register_globals(%[[FATBIN:.*]]: !cir.ptr<!cir.ptr<!void>> // CIR-NEXT: %[[NULL:.*]] = cir.const #cir.ptr<null> : !cir.ptr<!void> // CIR-NEXT: %[[STR_ADDR:.*]] = cir.get_global @".str_Z10kernelfunciii" // CIR-NEXT: %[[DEVICE_FUNC:.*]] = cir.cast bitcast %[[STR_ADDR]] @@ -74,6 +87,26 @@ void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); } // CIR-NEXT: %[[HOST_FUNC:.*]] = cir.cast bitcast %[[HOST_FUNC_RAW]] // CIR-NEXT: %[[THREAD_LIMIT:.*]] = cir.const #cir.int<-1> : !s32i // CIR-NEXT: cir.call @__cudaRegisterFunction(%{{.*}}, %[[HOST_FUNC]], %[[DEVICE_FUNC]], %[[DEVICE_FUNC]], %[[THREAD_LIMIT]], %[[NULL]], %[[NULL]], %[[NULL]], %[[NULL]], %[[NULL]]) +// Registration for __device__ int a (constant=0): +// CIR: %[[#NAMEA_RAW:]] = cir.get_global @".stra" +// CIR-NEXT: %[[#NAMEA:]] = cir.cast bitcast %[[#NAMEA_RAW]] +// CIR-NEXT: %[[#HOSTA_RAW:]] = cir.get_global @a +// CIR-NEXT: %[[#HOSTA:]] = cir.cast bitcast %[[#HOSTA_RAW]] +// CIR-NEXT: %[[#EXTA:]] = cir.const #cir.int<0> : !s32i +// CIR-NEXT: %[[#SZA:]] = cir.const #cir.int<4> : !u64i +// CIR-NEXT: %[[#CONA:]] = cir.const #cir.int<0> : !s32i +// CIR-NEXT: %[[#NORMA:]] = cir.const #cir.int<0> : !s32i +// CIR-NEXT: cir.call @__cudaRegisterVar(%[[FATBIN]], %[[#HOSTA]], %[[#NAMEA]], %[[#NAMEA]], %[[#EXTA]], %[[#SZA]], %[[#CONA]], %[[#NORMA]]) +// Registration for __constant__ int b (constant=1): +// CIR: %[[#NAMEB_RAW:]] = cir.get_global @".strb" +// CIR-NEXT: %[[#NAMEB:]] = cir.cast bitcast %[[#NAMEB_RAW]] +// CIR-NEXT: %[[#HOSTB_RAW:]] = cir.get_global @b +// CIR-NEXT: %[[#HOSTB:]] = cir.cast bitcast %[[#HOSTB_RAW]] +// CIR-NEXT: %[[#EXTB:]] = cir.const #cir.int<0> : !s32i +// CIR-NEXT: %[[#SZB:]] = cir.const #cir.int<4> : !u64i +// CIR-NEXT: %[[#CONB:]] = cir.const #cir.int<1> : !s32i +// CIR-NEXT: %[[#NORMB:]] = cir.const #cir.int<0> : !s32i +// CIR-NEXT: cir.call @__cudaRegisterVar(%[[FATBIN]], %[[#HOSTB]], %[[#NAMEB]], %[[#NAMEB]], %[[#EXTB]], %[[#SZB]], %[[#CONB]], %[[#NORMB]]) // CIR-NEXT: cir.return // CIR: cir.global "private" constant cir_private @__cuda_fatbin_str = #cir.const_array<"GPU binary would be here." : !cir.array<!u8i x 25>> : !cir.array<!u8i x 25> {alignment = 8 : i64, section = ".nv_fatbin"} @@ -111,8 +144,10 @@ void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); } // OGCG: @__cuda_gpubin_handle = internal global ptr null // OGCG: @llvm.global_ctors = appending global {{.*}}@__cuda_module_ctor -// OGCG: define internal void @__cuda_register_globals -// OGCG: call{{.*}}__cudaRegisterFunction(ptr %0, {{.*}}kernelfunc{{.*}}, ptr @0 +// OGCG: define internal void @__cuda_register_globals(ptr %[[#OGFATBIN:]]) +// OGCG: call{{.*}}__cudaRegisterFunction(ptr %[[#OGFATBIN]], {{.*}}kernelfunc{{.*}} +// OGCG: call void @__cudaRegisterVar(ptr %[[#OGFATBIN]], ptr @a, {{.*}}, {{.*}}, i32 0, i64 4, i32 0, i32 0) +// OGCG: call void @__cudaRegisterVar(ptr %[[#OGFATBIN]], ptr @b, {{.*}}, {{.*}}, i32 0, i64 4, i32 1, i32 0) // OGCG: ret void // OGCG: define internal void @__cuda_module_ctor @@ -134,8 +169,10 @@ void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); } // LLVM: load ptr, ptr @__cuda_gpubin_handle // LLVM: call void @__cudaUnregisterFatBinary -// LLVM: define internal void @__cuda_register_globals -// LLVM: call{{.*}}@__cudaRegisterFunction(ptr %{{.*}}, ptr @{{.*}}kernelfunc{{.*}}, ptr @{{.*}}, ptr @{{.*}}, i32 -1, ptr null, ptr null, ptr null, ptr null, ptr null) +// LLVM: define internal void @__cuda_register_globals(ptr %[[#FATBIN:]]) +// LLVM: call{{.*}}@__cudaRegisterFunction(ptr %[[#FATBIN]], ptr @{{.*}}kernelfunc{{.*}}, ptr @{{.*}}, ptr @{{.*}}, i32 -1, ptr null, ptr null, ptr null, ptr null, ptr null) +// LLVM: call void @__cudaRegisterVar(ptr %[[#FATBIN]], ptr @a, ptr @.stra, ptr @.stra, i32 0, i64 4, i32 0, i32 0) +// LLVM: call void @__cudaRegisterVar(ptr %[[#FATBIN]], ptr @b, ptr @.strb, ptr @.strb, i32 0, i64 4, i32 1, i32 0) // LLVM: ret void // LLVM: define internal void @__cuda_module_ctor _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
