llvmorg-github-actions[bot] wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-clangir
Author: David Rivera (RiverDave)
<details>
<summary>Changes</summary>
Related: #<!-- -->190087
Consumes `cu.var_registration` from #<!-- -->190087: emits
`__{cuda|hip}RegisterVar` calls in `__{cuda|hip}_register_globals` for each
device-side shadow.
Generated sequence for a TU with `__device__ int a;` and `__constant__ int b;`:
```c
void __cuda_register_globals(void **fatbin) {
__cudaRegisterFunction(fatbin, &kernelfunc, ".str_Z10...",
".str_Z10...", -1,
nullptr, nullptr, nullptr, nullptr, nullptr);
__cudaRegisterVar(fatbin, &a, ".stra", ".stra",
/*ext=*/0, /*size=*/4, /*constant=*/0, /*normalized=*/0);
__cudaRegisterVar(fatbin, &b, ".strb", ".strb",
/*ext=*/0, /*size=*/4, /*constant=*/1, /*normalized=*/0);
}
```
---
Full diff: https://github.com/llvm/llvm-project/pull/199270.diff
3 Files Affected:
- (modified) clang/include/clang/CIR/MissingFeatures.h (-1)
- (modified) clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp (+92-5)
- (modified) clang/test/CIR/CodeGenCUDA/device-stub.cu (+43-6)
``````````diff
diff --git a/clang/include/clang/CIR/MissingFeatures.h
b/clang/include/clang/CIR/MissingFeatures.h
index 8af09c5007495..ddcfbf5080e6f 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -219,7 +219,6 @@ struct MissingFeatures {
static bool ctorMemcpyizer() { return false; }
static bool cudaSupport() { return false; }
static bool hipModuleCtor() { return false; }
- static bool globalRegistration() { return false; }
static bool dataLayoutTypeAllocSize() { return false; }
static bool dataLayoutPtrHandlingBasedOnLangAS() { return false; }
static bool deferredCXXGlobalInit() { return false; }
diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
index b910ca3c8286c..3833a81abcb70 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -165,6 +165,8 @@ struct LoweringPreparePass
/// ------------
llvm::StringMap<FuncOp> cudaKernelMap;
+ llvm::SmallVector<std::pair<cir::GlobalOp, cir::CUDAVarRegistrationInfoAttr>>
+ cudaDeviceVars;
/// Build the CUDA module constructor that registers the fat binary
/// with the CUDA runtime.
@@ -172,6 +174,8 @@ struct LoweringPreparePass
std::optional<FuncOp> buildCUDAModuleDtor();
std::optional<FuncOp> buildHIPModuleDtor();
std::optional<FuncOp> buildCUDARegisterGlobals();
+ void buildCUDARegisterVars(cir::CIRBaseBuilderTy &builder,
+ FuncOp regGlobalFunc);
void buildCUDARegisterGlobalFunctions(cir::CIRBaseBuilderTy &builder,
FuncOp regGlobalFunc);
@@ -2239,6 +2243,9 @@ void LoweringPreparePass::runOnOp(mlir::Operation *op) {
lowerComplexMulOp(complexMul);
} else if (auto glob = mlir::dyn_cast<cir::GlobalOp>(op)) {
lowerGlobalOp(glob);
+ if (auto regAttr = glob->getAttrOfType<CUDAVarRegistrationInfoAttr>(
+ CUDAVarRegistrationInfoAttr::getMnemonic()))
+ cudaDeviceVars.emplace_back(glob, regAttr);
} else if (auto getGlob = mlir::dyn_cast<cir::GetGlobalOp>(op)) {
lowerGetGlobalOp(getGlob);
} else if (auto unaryOp = mlir::dyn_cast<cir::UnaryOpInterface>(op)) {
@@ -2304,7 +2311,7 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
// For CUDA without -fgpu-rdc, it's safe to stop generating ctor
// if there's nothing to register.
- if (cudaKernelMap.empty())
+ if (cudaKernelMap.empty() && cudaDeviceVars.empty())
return;
// There's no device-side binary, so no need to proceed for CUDA.
@@ -2658,8 +2665,7 @@ std::optional<FuncOp>
LoweringPreparePass::buildHIPModuleDtor() {
}
std::optional<FuncOp> LoweringPreparePass::buildCUDARegisterGlobals() {
- // There is nothing to register.
- if (cudaKernelMap.empty())
+ if (cudaKernelMap.empty() && cudaDeviceVars.empty())
return {};
cir::CIRBaseBuilderTy builder(getContext());
@@ -2683,8 +2689,7 @@ std::optional<FuncOp>
LoweringPreparePass::buildCUDARegisterGlobals() {
builder.setInsertionPointToStart(regGlobalFunc.addEntryBlock());
buildCUDARegisterGlobalFunctions(builder, regGlobalFunc);
- // TODO: Handle shadow registration
- assert(!cir::MissingFeatures::globalRegistration());
+ buildCUDARegisterVars(builder, regGlobalFunc);
ReturnOp::create(builder, loc);
return regGlobalFunc;
@@ -2774,6 +2779,88 @@ void
LoweringPreparePass::buildCUDARegisterGlobalFunctions(
}
}
+// Emit `__{cuda|hip}RegisterVar` calls inside `__{cuda|hip}_register_globals`
+// for every device-side shadow that carries a `cu.var_registration` attribute
+// (attached by `CIRGenNVCUDARuntime::handleVarRegistration`).
+void LoweringPreparePass::buildCUDARegisterVars(cir::CIRBaseBuilderTy &builder,
+ FuncOp regGlobalFunc) {
+ mlir::Location loc = mlirModule.getLoc();
+ llvm::StringRef cudaPrefix = getCUDAPrefix(astCtx);
+ cir::CIRDataLayout dataLayout(mlirModule);
+
+ PointerType voidPtrTy = builder.getVoidPtrTy();
+ PointerType voidPtrPtrTy = builder.getPointerTo(voidPtrTy);
+ IntType intTy = builder.getSIntNTy(32);
+ IntType sizeTy =
+ builder.getUIntNTy(astCtx->getTargetInfo().getMaxPointerWidth());
+ IntType charTy = cir::IntType::get(&getContext(), astCtx->getCharWidth(),
+ /*isSigned=*/false);
+
+ if (cudaDeviceVars.empty())
+ return;
+
+ cir::CIRBaseBuilderTy globalBuilder(getContext());
+ globalBuilder.setInsertionPointToStart(mlirModule.getBody());
+
+ // void __{cuda|hip}RegisterVar(void **fatbinHandle,
+ // char *hostVar, char *deviceAddress,
+ // const char *deviceName, int ext,
+ // size_t size, int constant, int normalized);
+ // OG ignores parameter types, treating pointers as void*.
+ cir::VoidType voidTy = builder.getVoidTy();
+ FuncOp cudaRegisterVar = buildRuntimeFunction(
+ globalBuilder, addUnderscoredPrefix(cudaPrefix, "RegisterVar"), loc,
+ FuncType::get({voidPtrPtrTy, voidPtrTy, voidPtrTy, voidPtrTy, intTy,
+ sizeTy, intTy, intTy},
+ voidTy));
+
+ auto makeConstantString = [&](llvm::StringRef str) -> GlobalOp {
+ auto strType = ArrayType::get(&getContext(), charTy, 1 + str.size());
+ auto tmpString = cir::GlobalOp::create(
+ globalBuilder, loc, (".str" + str).str(), strType,
+ /*isConstant=*/true, {},
+ /*linkage=*/cir::GlobalLinkageKind::PrivateLinkage);
+ tmpString.setInitialValueAttr(
+ ConstArrayAttr::get(strType, StringAttr::get(str + "\0", strType)));
+ tmpString.setPrivate();
+ return tmpString;
+ };
+
+ mlir::Value fatbinHandle = *regGlobalFunc.args_begin();
+
+ for (auto &[global, regAttr] : cudaDeviceVars) {
+ switch (regAttr.getKind()) {
+ case cir::CUDADeviceVarKind::Variable:
+ break;
+ case cir::CUDADeviceVarKind::Surface:
+ llvm_unreachable("Surface registration NYI");
+ case cir::CUDADeviceVarKind::Texture:
+ llvm_unreachable("Texture registration NYI");
+ }
+
+ if (regAttr.getIsManaged())
+ llvm_unreachable("Managed variable registration NYI");
+
+ GlobalOp deviceNameStr = makeConstantString(regAttr.getDeviceSideName());
+ mlir::Value deviceName = builder.createBitcast(
+ builder.createGetGlobal(deviceNameStr), voidPtrTy);
+ mlir::Value hostVar =
+ builder.createBitcast(builder.createGetGlobal(global), voidPtrTy);
+
+ auto isExtern = ConstantOp::create(
+ builder, loc, IntAttr::get(intTy, regAttr.getIsExtern() ? 1 : 0));
+ llvm::TypeSize size = dataLayout.getTypeSizeInBits(global.getSymType());
+ auto varSize = ConstantOp::create(
+ builder, loc, IntAttr::get(sizeTy, size.getFixedValue() / 8));
+ auto isConstant = ConstantOp::create(
+ builder, loc, IntAttr::get(intTy, regAttr.getIsConstant() ? 1 : 0));
+ auto normalized = ConstantOp::create(builder, loc, IntAttr::get(intTy, 0));
+ builder.createCallOp(loc, cudaRegisterVar,
+ {fatbinHandle, hostVar, deviceName, deviceName,
+ isExtern, varSize, isConstant, normalized});
+ }
+}
+
void LoweringPreparePass::runOnOperation() {
mlir::Operation *op = getOperation();
if (isa<::mlir::ModuleOp>(op))
diff --git a/clang/test/CIR/CodeGenCUDA/device-stub.cu
b/clang/test/CIR/CodeGenCUDA/device-stub.cu
index b635f5931df77..6517143f72aad 100644
--- a/clang/test/CIR/CodeGenCUDA/device-stub.cu
+++ b/clang/test/CIR/CodeGenCUDA/device-stub.cu
@@ -41,6 +41,11 @@ __global__ void kernelfunc(int i, int j, int k) {}
void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
+// Device-side shadows: exercise the __cudaRegisterVar code path alongside the
+// existing __cudaRegisterFunction kernel registration.
+__device__ int a;
+__constant__ int b;
+
// Check module constructor is registered in module attributes.
// CIR: cir.global_ctors = [#cir.global_ctor<"__cuda_module_ctor", 65535>]
@@ -57,6 +62,13 @@ void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
// CIR: cir.func private @__cudaRegisterFatBinaryEnd(!cir.ptr<!cir.ptr<!void>>)
+// __cudaRegisterVar runtime declaration and per-variable name strings for
+// device shadows. These are emitted between __cudaRegisterFatBinaryEnd and
+// __cudaRegisterFunction; relative order is not significant.
+// CIR-DAG: cir.func private @__cudaRegisterVar(!cir.ptr<!cir.ptr<!void>>,
!cir.ptr<!void>, !cir.ptr<!void>, !cir.ptr<!void>, !s32i, !u64i, !s32i, !s32i)
+// CIR-DAG: cir.global "private" constant cir_private @".stra" =
#cir.const_array<"a" : !cir.array<!u8i x 2>, trailing_zeros>
+// CIR-DAG: cir.global "private" constant cir_private @".strb" =
#cir.const_array<"b" : !cir.array<!u8i x 2>, trailing_zeros>
+
// Check the __cudaRegisterFunction runtime declaration:
// int __cudaRegisterFunction(void**, void*, void*, void*, int,
// void*, void*, void*, void*, void*)
@@ -65,8 +77,9 @@ void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
// Check the device-side name string for kernelfunc (mangled, null-terminated).
// CIR: cir.global "private" constant cir_private @".str_Z10kernelfunciii" =
#cir.const_array<"_Z10kernelfunciii" : !cir.array<!u8i x 18>, trailing_zeros> :
!cir.array<!u8i x 18>
-// Check __cuda_register_globals body: one __cudaRegisterFunction call per
kernel.
-// CIR: cir.func internal private @__cuda_register_globals(%arg0:
!cir.ptr<!cir.ptr<!void>>
+// Check __cuda_register_globals body: __cudaRegisterFunction for each kernel,
+// then __cudaRegisterVar for each device shadow.
+// CIR: cir.func internal private @__cuda_register_globals(%[[FATBIN:.*]]:
!cir.ptr<!cir.ptr<!void>>
// CIR-NEXT: %[[NULL:.*]] = cir.const #cir.ptr<null> : !cir.ptr<!void>
// CIR-NEXT: %[[STR_ADDR:.*]] = cir.get_global @".str_Z10kernelfunciii"
// CIR-NEXT: %[[DEVICE_FUNC:.*]] = cir.cast bitcast %[[STR_ADDR]]
@@ -74,6 +87,26 @@ void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
// CIR-NEXT: %[[HOST_FUNC:.*]] = cir.cast bitcast %[[HOST_FUNC_RAW]]
// CIR-NEXT: %[[THREAD_LIMIT:.*]] = cir.const #cir.int<-1> : !s32i
// CIR-NEXT: cir.call @__cudaRegisterFunction(%{{.*}}, %[[HOST_FUNC]],
%[[DEVICE_FUNC]], %[[DEVICE_FUNC]], %[[THREAD_LIMIT]], %[[NULL]], %[[NULL]],
%[[NULL]], %[[NULL]], %[[NULL]])
+// Registration for __device__ int a (constant=0):
+// CIR: %[[#NAMEA_RAW:]] = cir.get_global @".stra"
+// CIR-NEXT: %[[#NAMEA:]] = cir.cast bitcast %[[#NAMEA_RAW]]
+// CIR-NEXT: %[[#HOSTA_RAW:]] = cir.get_global @a
+// CIR-NEXT: %[[#HOSTA:]] = cir.cast bitcast %[[#HOSTA_RAW]]
+// CIR-NEXT: %[[#EXTA:]] = cir.const #cir.int<0> : !s32i
+// CIR-NEXT: %[[#SZA:]] = cir.const #cir.int<4> : !u64i
+// CIR-NEXT: %[[#CONA:]] = cir.const #cir.int<0> : !s32i
+// CIR-NEXT: %[[#NORMA:]] = cir.const #cir.int<0> : !s32i
+// CIR-NEXT: cir.call @__cudaRegisterVar(%[[FATBIN]], %[[#HOSTA]],
%[[#NAMEA]], %[[#NAMEA]], %[[#EXTA]], %[[#SZA]], %[[#CONA]], %[[#NORMA]])
+// Registration for __constant__ int b (constant=1):
+// CIR: %[[#NAMEB_RAW:]] = cir.get_global @".strb"
+// CIR-NEXT: %[[#NAMEB:]] = cir.cast bitcast %[[#NAMEB_RAW]]
+// CIR-NEXT: %[[#HOSTB_RAW:]] = cir.get_global @b
+// CIR-NEXT: %[[#HOSTB:]] = cir.cast bitcast %[[#HOSTB_RAW]]
+// CIR-NEXT: %[[#EXTB:]] = cir.const #cir.int<0> : !s32i
+// CIR-NEXT: %[[#SZB:]] = cir.const #cir.int<4> : !u64i
+// CIR-NEXT: %[[#CONB:]] = cir.const #cir.int<1> : !s32i
+// CIR-NEXT: %[[#NORMB:]] = cir.const #cir.int<0> : !s32i
+// CIR-NEXT: cir.call @__cudaRegisterVar(%[[FATBIN]], %[[#HOSTB]],
%[[#NAMEB]], %[[#NAMEB]], %[[#EXTB]], %[[#SZB]], %[[#CONB]], %[[#NORMB]])
// CIR-NEXT: cir.return
// CIR: cir.global "private" constant cir_private @__cuda_fatbin_str =
#cir.const_array<"GPU binary would be here." : !cir.array<!u8i x 25>> :
!cir.array<!u8i x 25> {alignment = 8 : i64, section = ".nv_fatbin"}
@@ -111,8 +144,10 @@ void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
// OGCG: @__cuda_gpubin_handle = internal global ptr null
// OGCG: @llvm.global_ctors = appending global {{.*}}@__cuda_module_ctor
-// OGCG: define internal void @__cuda_register_globals
-// OGCG: call{{.*}}__cudaRegisterFunction(ptr %0, {{.*}}kernelfunc{{.*}}, ptr
@0
+// OGCG: define internal void @__cuda_register_globals(ptr %[[#OGFATBIN:]])
+// OGCG: call{{.*}}__cudaRegisterFunction(ptr %[[#OGFATBIN]],
{{.*}}kernelfunc{{.*}}
+// OGCG: call void @__cudaRegisterVar(ptr %[[#OGFATBIN]], ptr @a, {{.*}},
{{.*}}, i32 0, i64 4, i32 0, i32 0)
+// OGCG: call void @__cudaRegisterVar(ptr %[[#OGFATBIN]], ptr @b, {{.*}},
{{.*}}, i32 0, i64 4, i32 1, i32 0)
// OGCG: ret void
// OGCG: define internal void @__cuda_module_ctor
@@ -134,8 +169,10 @@ void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
// LLVM: load ptr, ptr @__cuda_gpubin_handle
// LLVM: call void @__cudaUnregisterFatBinary
-// LLVM: define internal void @__cuda_register_globals
-// LLVM: call{{.*}}@__cudaRegisterFunction(ptr %{{.*}}, ptr
@{{.*}}kernelfunc{{.*}}, ptr @{{.*}}, ptr @{{.*}}, i32 -1, ptr null, ptr null,
ptr null, ptr null, ptr null)
+// LLVM: define internal void @__cuda_register_globals(ptr %[[#FATBIN:]])
+// LLVM: call{{.*}}@__cudaRegisterFunction(ptr %[[#FATBIN]], ptr
@{{.*}}kernelfunc{{.*}}, ptr @{{.*}}, ptr @{{.*}}, i32 -1, ptr null, ptr null,
ptr null, ptr null, ptr null)
+// LLVM: call void @__cudaRegisterVar(ptr %[[#FATBIN]], ptr @a, ptr @.stra,
ptr @.stra, i32 0, i64 4, i32 0, i32 0)
+// LLVM: call void @__cudaRegisterVar(ptr %[[#FATBIN]], ptr @b, ptr @.strb,
ptr @.strb, i32 0, i64 4, i32 1, i32 0)
// LLVM: ret void
// LLVM: define internal void @__cuda_module_ctor
``````````
</details>
https://github.com/llvm/llvm-project/pull/199270
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits