[clang] [llvm] [Offload] Change unregister library to use `atexit` instead of destructor (PR #86830)
https://github.com/jhuber6 closed https://github.com/llvm/llvm-project/pull/86830 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [Offload] Change unregister library to use `atexit` instead of destructor (PR #86830)
https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/86830 >From 875ed36029851a2423c97b28bd5bf34975efb016 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Wed, 27 Mar 2024 11:43:37 -0500 Subject: [PATCH 1/2] [Offload] Change unregister library to use `atexit` instead of destructor Summary: The 'new driver' sets up the lifetime of a registered liftime using global constructors and destructors. Currently, this is put at priority 1 which isn't strictly conformant as it will conflict with system utilities. We now use 101 as this is the loweest suggested for non-system constructors and will still run before user constructors. Secondly, there were issues with the CUDA runtime when destructed with a global destructor. Because the global ones are in any order and potentially run before other things we were hitting an edge case where the OpenMP runtime was uninitialized *after* `_dl_fini` was called. This would result in us erroring when we call into a destroyed `libcuda.so` instance. using `atexit` is what CUDA / HIP use and it prevents this from happening. Most everything uses `atexit` except system utilities and because of the constructor priority it will be unregistered *after* everything else but not after `_fl_fini`. --- clang/test/Driver/linker-wrapper-image.c | 8 +-- .../Frontend/Offloading/OffloadWrapper.cpp| 70 ++- 2 files changed, 41 insertions(+), 37 deletions(-) diff --git a/clang/test/Driver/linker-wrapper-image.c b/clang/test/Driver/linker-wrapper-image.c index 75475264135224..d01445e3aed04e 100644 --- a/clang/test/Driver/linker-wrapper-image.c +++ b/clang/test/Driver/linker-wrapper-image.c @@ -26,11 +26,11 @@ // OPENMP: @.omp_offloading.device_image = internal unnamed_addr constant [[[SIZE:[0-9]+]] x i8] c"\10\FF\10\AD{{.*}}", section ".llvm.offloading", align 8 // OPENMP-NEXT: @.omp_offloading.device_images = internal unnamed_addr constant [1 x %__tgt_device_image] [%__tgt_device_image { ptr getelementptr inbounds ([[[BEGIN:[0-9]+]] x i8], ptr @.omp_offloading.device_image, i64 1, i64 0), ptr getelementptr inbounds ([[[END:[0-9]+]] x i8], ptr @.omp_offloading.device_image, i64 1, i64 0), ptr @__start_omp_offloading_entries, ptr @__stop_omp_offloading_entries }] // OPENMP-NEXT: @.omp_offloading.descriptor = internal constant %__tgt_bin_desc { i32 1, ptr @.omp_offloading.device_images, ptr @__start_omp_offloading_entries, ptr @__stop_omp_offloading_entries } -// OPENMP-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.omp_offloading.descriptor_reg, ptr null }] -// OPENMP-NEXT: @llvm.global_dtors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.omp_offloading.descriptor_unreg, ptr null }] +// OPENMP-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 101, ptr @.omp_offloading.descriptor_reg, ptr null }] // OPENMP: define internal void @.omp_offloading.descriptor_reg() section ".text.startup" { // OPENMP-NEXT: entry: +// OPENMP-NEXT: %0 = call i32 @atexit(ptr @.omp_offloading.descriptor_unreg) // OPENMP-NEXT: call void @__tgt_register_lib(ptr @.omp_offloading.descriptor) // OPENMP-NEXT: ret void // OPENMP-NEXT: } @@ -62,7 +62,7 @@ // CUDA-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1180844977, i32 1, ptr @.fatbin_image, ptr null }, section ".nvFatBinSegment", align 8 // CUDA-NEXT: @.cuda.binary_handle = internal global ptr null -// CUDA: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.cuda.fatbin_reg, ptr null }] +// CUDA: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 101, ptr @.cuda.fatbin_reg, ptr null }] // CUDA: define internal void @.cuda.fatbin_reg() section ".text.startup" { // CUDA-NEXT: entry: @@ -162,7 +162,7 @@ // HIP-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1212764230, i32 1, ptr @.fatbin_image, ptr null }, section ".hipFatBinSegment", align 8 // HIP-NEXT: @.hip.binary_handle = internal global ptr null -// HIP: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.hip.fatbin_reg, ptr null }] +// HIP: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 101, ptr @.hip.fatbin_reg, ptr null }] // HIP: define internal void @.hip.fatbin_reg() section ".text.startup" { // HIP-NEXT: entry: diff --git a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp index fec1bdbe9d8c74..86e8712ce95ae6 100644 --- a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp +++ b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp @@ -186,57 +186,62 @@ GlobalVariable *createBinDesc(Module &M, ArrayRef> Bufs, ".omp_offloading.descriptor" + Suffix); } -void createRegisterFunction(Module &M
[clang] [llvm] [Offload] Change unregister library to use `atexit` instead of destructor (PR #86830)
@@ -186,57 +186,62 @@ GlobalVariable *createBinDesc(Module &M, ArrayRef> Bufs, ".omp_offloading.descriptor" + Suffix); } -void createRegisterFunction(Module &M, GlobalVariable *BinDesc, -StringRef Suffix) { +Function *createUnregisterFunction(Module &M, GlobalVariable *BinDesc, + StringRef Suffix) { LLVMContext &C = M.getContext(); auto *FuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false); - auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage, -".omp_offloading.descriptor_reg" + Suffix, &M); + auto *Func = + Function::Create(FuncTy, GlobalValue::InternalLinkage, + ".omp_offloading.descriptor_unreg" + Suffix, &M); Func->setSection(".text.startup"); - // Get __tgt_register_lib function declaration. - auto *RegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M), - /*isVarArg*/ false); - FunctionCallee RegFuncC = - M.getOrInsertFunction("__tgt_register_lib", RegFuncTy); + // Get __tgt_unregister_lib function declaration. + auto *UnRegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M), +/*isVarArg*/ false); + FunctionCallee UnRegFuncC = + M.getOrInsertFunction("__tgt_unregister_lib", UnRegFuncTy); // Construct function body IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func)); - Builder.CreateCall(RegFuncC, BinDesc); + Builder.CreateCall(UnRegFuncC, BinDesc); Builder.CreateRetVoid(); - // Add this function to constructors. - // Set priority to 1 so that __tgt_register_lib is executed AFTER - // __tgt_register_requires (we want to know what requirements have been - // asked for before we load a libomptarget plugin so that by the time the - // plugin is loaded it can report how many devices there are which can - // satisfy these requirements). - appendToGlobalCtors(M, Func, /*Priority*/ 1); + return Func; } -void createUnregisterFunction(Module &M, GlobalVariable *BinDesc, - StringRef Suffix) { +void createRegisterFunction(Module &M, GlobalVariable *BinDesc, +StringRef Suffix) { LLVMContext &C = M.getContext(); auto *FuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false); - auto *Func = - Function::Create(FuncTy, GlobalValue::InternalLinkage, - ".omp_offloading.descriptor_unreg" + Suffix, &M); + auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage, +".omp_offloading.descriptor_reg" + Suffix, &M); Func->setSection(".text.startup"); - // Get __tgt_unregister_lib function declaration. - auto *UnRegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M), -/*isVarArg*/ false); - FunctionCallee UnRegFuncC = - M.getOrInsertFunction("__tgt_unregister_lib", UnRegFuncTy); + // Get __tgt_register_lib function declaration. + auto *RegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M), + /*isVarArg*/ false); + FunctionCallee RegFuncC = + M.getOrInsertFunction("__tgt_register_lib", RegFuncTy); + + auto *AtExitTy = FunctionType::get( + Type::getInt32Ty(C), PointerType::getUnqual(C), /*isVarArg=*/false); + FunctionCallee AtExit = M.getOrInsertFunction("atexit", AtExitTy); + + Function *UnregFunc = createUnregisterFunction(M, BinDesc, Suffix); // Construct function body IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func)); - Builder.CreateCall(UnRegFuncC, BinDesc); + + // Register the destructors with 'atexit', This is expected by the CUDA jhuber6 wrote: ```suggestion // Register the destructors with 'atexit'. This is expected by the CUDA ``` https://github.com/llvm/llvm-project/pull/86830 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [Offload] Change unregister library to use `atexit` instead of destructor (PR #86830)
@@ -186,57 +186,62 @@ GlobalVariable *createBinDesc(Module &M, ArrayRef> Bufs, ".omp_offloading.descriptor" + Suffix); } -void createRegisterFunction(Module &M, GlobalVariable *BinDesc, -StringRef Suffix) { +Function *createUnregisterFunction(Module &M, GlobalVariable *BinDesc, + StringRef Suffix) { LLVMContext &C = M.getContext(); auto *FuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false); - auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage, -".omp_offloading.descriptor_reg" + Suffix, &M); + auto *Func = + Function::Create(FuncTy, GlobalValue::InternalLinkage, + ".omp_offloading.descriptor_unreg" + Suffix, &M); Func->setSection(".text.startup"); - // Get __tgt_register_lib function declaration. - auto *RegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M), - /*isVarArg*/ false); - FunctionCallee RegFuncC = - M.getOrInsertFunction("__tgt_register_lib", RegFuncTy); + // Get __tgt_unregister_lib function declaration. + auto *UnRegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M), +/*isVarArg*/ false); + FunctionCallee UnRegFuncC = + M.getOrInsertFunction("__tgt_unregister_lib", UnRegFuncTy); // Construct function body IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func)); - Builder.CreateCall(RegFuncC, BinDesc); + Builder.CreateCall(UnRegFuncC, BinDesc); Builder.CreateRetVoid(); - // Add this function to constructors. - // Set priority to 1 so that __tgt_register_lib is executed AFTER - // __tgt_register_requires (we want to know what requirements have been - // asked for before we load a libomptarget plugin so that by the time the - // plugin is loaded it can report how many devices there are which can - // satisfy these requirements). - appendToGlobalCtors(M, Func, /*Priority*/ 1); + return Func; } -void createUnregisterFunction(Module &M, GlobalVariable *BinDesc, - StringRef Suffix) { +void createRegisterFunction(Module &M, GlobalVariable *BinDesc, +StringRef Suffix) { LLVMContext &C = M.getContext(); auto *FuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false); - auto *Func = - Function::Create(FuncTy, GlobalValue::InternalLinkage, - ".omp_offloading.descriptor_unreg" + Suffix, &M); + auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage, +".omp_offloading.descriptor_reg" + Suffix, &M); Func->setSection(".text.startup"); - // Get __tgt_unregister_lib function declaration. - auto *UnRegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M), -/*isVarArg*/ false); - FunctionCallee UnRegFuncC = - M.getOrInsertFunction("__tgt_unregister_lib", UnRegFuncTy); + // Get __tgt_register_lib function declaration. + auto *RegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M), + /*isVarArg*/ false); + FunctionCallee RegFuncC = + M.getOrInsertFunction("__tgt_register_lib", RegFuncTy); + + auto *AtExitTy = FunctionType::get( + Type::getInt32Ty(C), PointerType::getUnqual(C), /*isVarArg=*/false); + FunctionCallee AtExit = M.getOrInsertFunction("atexit", AtExitTy); + + Function *UnregFunc = createUnregisterFunction(M, BinDesc, Suffix); // Construct function body IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func)); - Builder.CreateCall(UnRegFuncC, BinDesc); + + // Register the destructors with 'atexit', This is expected by the CUDA jhuber6 wrote: I think that's actually in this file somewhere for the CUDA wrapper portion. https://github.com/llvm/llvm-project/pull/86830 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [Offload] Change unregister library to use `atexit` instead of destructor (PR #86830)
https://github.com/Artem-B edited https://github.com/llvm/llvm-project/pull/86830 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [Offload] Change unregister library to use `atexit` instead of destructor (PR #86830)
@@ -186,57 +186,62 @@ GlobalVariable *createBinDesc(Module &M, ArrayRef> Bufs, ".omp_offloading.descriptor" + Suffix); } -void createRegisterFunction(Module &M, GlobalVariable *BinDesc, -StringRef Suffix) { +Function *createUnregisterFunction(Module &M, GlobalVariable *BinDesc, + StringRef Suffix) { LLVMContext &C = M.getContext(); auto *FuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false); - auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage, -".omp_offloading.descriptor_reg" + Suffix, &M); + auto *Func = + Function::Create(FuncTy, GlobalValue::InternalLinkage, + ".omp_offloading.descriptor_unreg" + Suffix, &M); Func->setSection(".text.startup"); - // Get __tgt_register_lib function declaration. - auto *RegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M), - /*isVarArg*/ false); - FunctionCallee RegFuncC = - M.getOrInsertFunction("__tgt_register_lib", RegFuncTy); + // Get __tgt_unregister_lib function declaration. + auto *UnRegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M), +/*isVarArg*/ false); + FunctionCallee UnRegFuncC = + M.getOrInsertFunction("__tgt_unregister_lib", UnRegFuncTy); // Construct function body IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func)); - Builder.CreateCall(RegFuncC, BinDesc); + Builder.CreateCall(UnRegFuncC, BinDesc); Builder.CreateRetVoid(); - // Add this function to constructors. - // Set priority to 1 so that __tgt_register_lib is executed AFTER - // __tgt_register_requires (we want to know what requirements have been - // asked for before we load a libomptarget plugin so that by the time the - // plugin is loaded it can report how many devices there are which can - // satisfy these requirements). - appendToGlobalCtors(M, Func, /*Priority*/ 1); + return Func; } -void createUnregisterFunction(Module &M, GlobalVariable *BinDesc, - StringRef Suffix) { +void createRegisterFunction(Module &M, GlobalVariable *BinDesc, +StringRef Suffix) { LLVMContext &C = M.getContext(); auto *FuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false); - auto *Func = - Function::Create(FuncTy, GlobalValue::InternalLinkage, - ".omp_offloading.descriptor_unreg" + Suffix, &M); + auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage, +".omp_offloading.descriptor_reg" + Suffix, &M); Func->setSection(".text.startup"); - // Get __tgt_unregister_lib function declaration. - auto *UnRegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M), -/*isVarArg*/ false); - FunctionCallee UnRegFuncC = - M.getOrInsertFunction("__tgt_unregister_lib", UnRegFuncTy); + // Get __tgt_register_lib function declaration. + auto *RegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M), + /*isVarArg*/ false); + FunctionCallee RegFuncC = + M.getOrInsertFunction("__tgt_register_lib", RegFuncTy); + + auto *AtExitTy = FunctionType::get( + Type::getInt32Ty(C), PointerType::getUnqual(C), /*isVarArg=*/false); + FunctionCallee AtExit = M.getOrInsertFunction("atexit", AtExitTy); + + Function *UnregFunc = createUnregisterFunction(M, BinDesc, Suffix); // Construct function body IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func)); - Builder.CreateCall(UnRegFuncC, BinDesc); + + // Register the destructors with 'atexit', This is expected by the CUDA Artem-B wrote: Typo. `,` -> `.` https://github.com/llvm/llvm-project/pull/86830 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [Offload] Change unregister library to use `atexit` instead of destructor (PR #86830)
@@ -186,57 +186,62 @@ GlobalVariable *createBinDesc(Module &M, ArrayRef> Bufs, ".omp_offloading.descriptor" + Suffix); } -void createRegisterFunction(Module &M, GlobalVariable *BinDesc, -StringRef Suffix) { +Function *createUnregisterFunction(Module &M, GlobalVariable *BinDesc, + StringRef Suffix) { LLVMContext &C = M.getContext(); auto *FuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false); - auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage, -".omp_offloading.descriptor_reg" + Suffix, &M); + auto *Func = + Function::Create(FuncTy, GlobalValue::InternalLinkage, + ".omp_offloading.descriptor_unreg" + Suffix, &M); Func->setSection(".text.startup"); - // Get __tgt_register_lib function declaration. - auto *RegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M), - /*isVarArg*/ false); - FunctionCallee RegFuncC = - M.getOrInsertFunction("__tgt_register_lib", RegFuncTy); + // Get __tgt_unregister_lib function declaration. + auto *UnRegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M), +/*isVarArg*/ false); + FunctionCallee UnRegFuncC = + M.getOrInsertFunction("__tgt_unregister_lib", UnRegFuncTy); // Construct function body IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func)); - Builder.CreateCall(RegFuncC, BinDesc); + Builder.CreateCall(UnRegFuncC, BinDesc); Builder.CreateRetVoid(); - // Add this function to constructors. - // Set priority to 1 so that __tgt_register_lib is executed AFTER - // __tgt_register_requires (we want to know what requirements have been - // asked for before we load a libomptarget plugin so that by the time the - // plugin is loaded it can report how many devices there are which can - // satisfy these requirements). - appendToGlobalCtors(M, Func, /*Priority*/ 1); + return Func; } -void createUnregisterFunction(Module &M, GlobalVariable *BinDesc, - StringRef Suffix) { +void createRegisterFunction(Module &M, GlobalVariable *BinDesc, +StringRef Suffix) { LLVMContext &C = M.getContext(); auto *FuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false); - auto *Func = - Function::Create(FuncTy, GlobalValue::InternalLinkage, - ".omp_offloading.descriptor_unreg" + Suffix, &M); + auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage, +".omp_offloading.descriptor_reg" + Suffix, &M); Func->setSection(".text.startup"); - // Get __tgt_unregister_lib function declaration. - auto *UnRegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M), -/*isVarArg*/ false); - FunctionCallee UnRegFuncC = - M.getOrInsertFunction("__tgt_unregister_lib", UnRegFuncTy); + // Get __tgt_register_lib function declaration. + auto *RegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M), + /*isVarArg*/ false); + FunctionCallee RegFuncC = + M.getOrInsertFunction("__tgt_register_lib", RegFuncTy); + + auto *AtExitTy = FunctionType::get( + Type::getInt32Ty(C), PointerType::getUnqual(C), /*isVarArg=*/false); + FunctionCallee AtExit = M.getOrInsertFunction("atexit", AtExitTy); + + Function *UnregFunc = createUnregisterFunction(M, BinDesc, Suffix); // Construct function body IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func)); - Builder.CreateCall(UnRegFuncC, BinDesc); + + // Register the destructors with 'atexit', This is expected by the CUDA Artem-B wrote: > This is expected by the CUDA runtime I'd add a reference to clang/lib/CodeGen/CGCUDANV.cpp which provides some history why we switched to `atexit`. https://github.com/llvm/llvm-project/pull/86830 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [Offload] Change unregister library to use `atexit` instead of destructor (PR #86830)
https://github.com/Artem-B approved this pull request. https://github.com/llvm/llvm-project/pull/86830 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [Offload] Change unregister library to use `atexit` instead of destructor (PR #86830)
jhuber6 wrote: Fixed, I neglected the fact that OpenMP registers more destructors inside of the constructor itself. Passes all the tests now. https://github.com/llvm/llvm-project/pull/86830 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [Offload] Change unregister library to use `atexit` instead of destructor (PR #86830)
https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/86830 >From 875ed36029851a2423c97b28bd5bf34975efb016 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Wed, 27 Mar 2024 11:43:37 -0500 Subject: [PATCH] [Offload] Change unregister library to use `atexit` instead of destructor Summary: The 'new driver' sets up the lifetime of a registered liftime using global constructors and destructors. Currently, this is put at priority 1 which isn't strictly conformant as it will conflict with system utilities. We now use 101 as this is the loweest suggested for non-system constructors and will still run before user constructors. Secondly, there were issues with the CUDA runtime when destructed with a global destructor. Because the global ones are in any order and potentially run before other things we were hitting an edge case where the OpenMP runtime was uninitialized *after* `_dl_fini` was called. This would result in us erroring when we call into a destroyed `libcuda.so` instance. using `atexit` is what CUDA / HIP use and it prevents this from happening. Most everything uses `atexit` except system utilities and because of the constructor priority it will be unregistered *after* everything else but not after `_fl_fini`. --- clang/test/Driver/linker-wrapper-image.c | 8 +-- .../Frontend/Offloading/OffloadWrapper.cpp| 70 ++- 2 files changed, 41 insertions(+), 37 deletions(-) diff --git a/clang/test/Driver/linker-wrapper-image.c b/clang/test/Driver/linker-wrapper-image.c index 75475264135224..d01445e3aed04e 100644 --- a/clang/test/Driver/linker-wrapper-image.c +++ b/clang/test/Driver/linker-wrapper-image.c @@ -26,11 +26,11 @@ // OPENMP: @.omp_offloading.device_image = internal unnamed_addr constant [[[SIZE:[0-9]+]] x i8] c"\10\FF\10\AD{{.*}}", section ".llvm.offloading", align 8 // OPENMP-NEXT: @.omp_offloading.device_images = internal unnamed_addr constant [1 x %__tgt_device_image] [%__tgt_device_image { ptr getelementptr inbounds ([[[BEGIN:[0-9]+]] x i8], ptr @.omp_offloading.device_image, i64 1, i64 0), ptr getelementptr inbounds ([[[END:[0-9]+]] x i8], ptr @.omp_offloading.device_image, i64 1, i64 0), ptr @__start_omp_offloading_entries, ptr @__stop_omp_offloading_entries }] // OPENMP-NEXT: @.omp_offloading.descriptor = internal constant %__tgt_bin_desc { i32 1, ptr @.omp_offloading.device_images, ptr @__start_omp_offloading_entries, ptr @__stop_omp_offloading_entries } -// OPENMP-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.omp_offloading.descriptor_reg, ptr null }] -// OPENMP-NEXT: @llvm.global_dtors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.omp_offloading.descriptor_unreg, ptr null }] +// OPENMP-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 101, ptr @.omp_offloading.descriptor_reg, ptr null }] // OPENMP: define internal void @.omp_offloading.descriptor_reg() section ".text.startup" { // OPENMP-NEXT: entry: +// OPENMP-NEXT: %0 = call i32 @atexit(ptr @.omp_offloading.descriptor_unreg) // OPENMP-NEXT: call void @__tgt_register_lib(ptr @.omp_offloading.descriptor) // OPENMP-NEXT: ret void // OPENMP-NEXT: } @@ -62,7 +62,7 @@ // CUDA-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1180844977, i32 1, ptr @.fatbin_image, ptr null }, section ".nvFatBinSegment", align 8 // CUDA-NEXT: @.cuda.binary_handle = internal global ptr null -// CUDA: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.cuda.fatbin_reg, ptr null }] +// CUDA: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 101, ptr @.cuda.fatbin_reg, ptr null }] // CUDA: define internal void @.cuda.fatbin_reg() section ".text.startup" { // CUDA-NEXT: entry: @@ -162,7 +162,7 @@ // HIP-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1212764230, i32 1, ptr @.fatbin_image, ptr null }, section ".hipFatBinSegment", align 8 // HIP-NEXT: @.hip.binary_handle = internal global ptr null -// HIP: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.hip.fatbin_reg, ptr null }] +// HIP: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 101, ptr @.hip.fatbin_reg, ptr null }] // HIP: define internal void @.hip.fatbin_reg() section ".text.startup" { // HIP-NEXT: entry: diff --git a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp index fec1bdbe9d8c74..86e8712ce95ae6 100644 --- a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp +++ b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp @@ -186,57 +186,62 @@ GlobalVariable *createBinDesc(Module &M, ArrayRef> Bufs, ".omp_offloading.descriptor" + Suffix); } -void createRegisterFunction(Module &M, Gl
[clang] [llvm] [Offload] Change unregister library to use `atexit` instead of destructor (PR #86830)
jhuber6 wrote: So, looking into `libomptarget` when this is applied is doing something weird. `atexit` is functionally a stack, so that means the first in is the last out. However, it seems that the static global constructor created inside of the CUDA plugin is being unregistered *after* this one for unknown reasons. Will need to look into that. https://github.com/llvm/llvm-project/pull/86830 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [Offload] Change unregister library to use `atexit` instead of destructor (PR #86830)
@@ -186,57 +186,60 @@ GlobalVariable *createBinDesc(Module &M, ArrayRef> Bufs, ".omp_offloading.descriptor" + Suffix); } -void createRegisterFunction(Module &M, GlobalVariable *BinDesc, -StringRef Suffix) { +Function *createUnregisterFunction(Module &M, GlobalVariable *BinDesc, jhuber6 wrote: No, since I need to call this function from `createRegisterFunction` now. I could forward declare it but I don't think there's a point given it's inside an anonymous namespace. https://github.com/llvm/llvm-project/pull/86830 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [Offload] Change unregister library to use `atexit` instead of destructor (PR #86830)
@@ -186,57 +186,60 @@ GlobalVariable *createBinDesc(Module &M, ArrayRef> Bufs, ".omp_offloading.descriptor" + Suffix); } -void createRegisterFunction(Module &M, GlobalVariable *BinDesc, -StringRef Suffix) { +Function *createUnregisterFunction(Module &M, GlobalVariable *BinDesc, yxsamliu wrote: It seems the order of createRegisterFunction and createUnregisterFunction is swapped. This causes some artificial differences. Is it OK to keep their original order. https://github.com/llvm/llvm-project/pull/86830 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [Offload] Change unregister library to use `atexit` instead of destructor (PR #86830)
llvmbot wrote: @llvm/pr-subscribers-clang Author: Joseph Huber (jhuber6) Changes Summary: The 'new driver' sets up the lifetime of a registered liftime using global constructors and destructors. Currently, this is put at priority 1 which isn't strictly conformant as it will conflict with system utilities. We now use 101 as this is the loweest suggested for non-system constructors and will still run before user constructors. Secondly, there were issues with the CUDA runtime when destructed with a global destructor. Because the global ones are in any order and potentially run before other things we were hitting an edge case where the OpenMP runtime was uninitialized *after* `_dl_fini` was called. This would result in us erroring when we call into a destroyed `libcuda.so` instance. using `atexit` is what CUDA / HIP use and it prevents this from happening. Most everything uses `atexit` except system utilities and because of the constructor priority it will be unregistered *after* everything else but not after `_fl_fini`. --- Full diff: https://github.com/llvm/llvm-project/pull/86830.diff 2 Files Affected: - (modified) clang/test/Driver/linker-wrapper-image.c (+4-4) - (modified) llvm/lib/Frontend/Offloading/OffloadWrapper.cpp (+35-33) ``diff diff --git a/clang/test/Driver/linker-wrapper-image.c b/clang/test/Driver/linker-wrapper-image.c index 75475264135224..5d5d62805e174d 100644 --- a/clang/test/Driver/linker-wrapper-image.c +++ b/clang/test/Driver/linker-wrapper-image.c @@ -26,12 +26,12 @@ // OPENMP: @.omp_offloading.device_image = internal unnamed_addr constant [[[SIZE:[0-9]+]] x i8] c"\10\FF\10\AD{{.*}}", section ".llvm.offloading", align 8 // OPENMP-NEXT: @.omp_offloading.device_images = internal unnamed_addr constant [1 x %__tgt_device_image] [%__tgt_device_image { ptr getelementptr inbounds ([[[BEGIN:[0-9]+]] x i8], ptr @.omp_offloading.device_image, i64 1, i64 0), ptr getelementptr inbounds ([[[END:[0-9]+]] x i8], ptr @.omp_offloading.device_image, i64 1, i64 0), ptr @__start_omp_offloading_entries, ptr @__stop_omp_offloading_entries }] // OPENMP-NEXT: @.omp_offloading.descriptor = internal constant %__tgt_bin_desc { i32 1, ptr @.omp_offloading.device_images, ptr @__start_omp_offloading_entries, ptr @__stop_omp_offloading_entries } -// OPENMP-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.omp_offloading.descriptor_reg, ptr null }] -// OPENMP-NEXT: @llvm.global_dtors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.omp_offloading.descriptor_unreg, ptr null }] +// OPENMP-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 101, ptr @.omp_offloading.descriptor_reg, ptr null }] // OPENMP: define internal void @.omp_offloading.descriptor_reg() section ".text.startup" { // OPENMP-NEXT: entry: // OPENMP-NEXT: call void @__tgt_register_lib(ptr @.omp_offloading.descriptor) +// OPENMP-NEXT: %0 = call i32 @atexit(ptr @.omp_offloading.descriptor_unreg) // OPENMP-NEXT: ret void // OPENMP-NEXT: } @@ -62,7 +62,7 @@ // CUDA-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1180844977, i32 1, ptr @.fatbin_image, ptr null }, section ".nvFatBinSegment", align 8 // CUDA-NEXT: @.cuda.binary_handle = internal global ptr null -// CUDA: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.cuda.fatbin_reg, ptr null }] +// CUDA: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 101, ptr @.cuda.fatbin_reg, ptr null }] // CUDA: define internal void @.cuda.fatbin_reg() section ".text.startup" { // CUDA-NEXT: entry: @@ -162,7 +162,7 @@ // HIP-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1212764230, i32 1, ptr @.fatbin_image, ptr null }, section ".hipFatBinSegment", align 8 // HIP-NEXT: @.hip.binary_handle = internal global ptr null -// HIP: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.hip.fatbin_reg, ptr null }] +// HIP: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 101, ptr @.hip.fatbin_reg, ptr null }] // HIP: define internal void @.hip.fatbin_reg() section ".text.startup" { // HIP-NEXT: entry: diff --git a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp index fec1bdbe9d8c74..4f9494d2143fce 100644 --- a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp +++ b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp @@ -186,57 +186,60 @@ GlobalVariable *createBinDesc(Module &M, ArrayRef> Bufs, ".omp_offloading.descriptor" + Suffix); } -void createRegisterFunction(Module &M, GlobalVariable *BinDesc, -StringRef Suffix) { +Function *createUnregisterFunction(Module &M, GlobalVariable *BinDesc, +
[clang] [llvm] [Offload] Change unregister library to use `atexit` instead of destructor (PR #86830)
https://github.com/jhuber6 created https://github.com/llvm/llvm-project/pull/86830 Summary: The 'new driver' sets up the lifetime of a registered liftime using global constructors and destructors. Currently, this is put at priority 1 which isn't strictly conformant as it will conflict with system utilities. We now use 101 as this is the loweest suggested for non-system constructors and will still run before user constructors. Secondly, there were issues with the CUDA runtime when destructed with a global destructor. Because the global ones are in any order and potentially run before other things we were hitting an edge case where the OpenMP runtime was uninitialized *after* `_dl_fini` was called. This would result in us erroring when we call into a destroyed `libcuda.so` instance. using `atexit` is what CUDA / HIP use and it prevents this from happening. Most everything uses `atexit` except system utilities and because of the constructor priority it will be unregistered *after* everything else but not after `_fl_fini`. >From 1583db25c7a24e88fc3439820538ff2ef8e24429 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Wed, 27 Mar 2024 11:43:37 -0500 Subject: [PATCH] [Offload] Change unregister library to use `atexit` instead of destructor Summary: The 'new driver' sets up the lifetime of a registered liftime using global constructors and destructors. Currently, this is put at priority 1 which isn't strictly conformant as it will conflict with system utilities. We now use 101 as this is the loweest suggested for non-system constructors and will still run before user constructors. Secondly, there were issues with the CUDA runtime when destructed with a global destructor. Because the global ones are in any order and potentially run before other things we were hitting an edge case where the OpenMP runtime was uninitialized *after* `_dl_fini` was called. This would result in us erroring when we call into a destroyed `libcuda.so` instance. using `atexit` is what CUDA / HIP use and it prevents this from happening. Most everything uses `atexit` except system utilities and because of the constructor priority it will be unregistered *after* everything else but not after `_fl_fini`. --- clang/test/Driver/linker-wrapper-image.c | 8 +-- .../Frontend/Offloading/OffloadWrapper.cpp| 68 ++- 2 files changed, 39 insertions(+), 37 deletions(-) diff --git a/clang/test/Driver/linker-wrapper-image.c b/clang/test/Driver/linker-wrapper-image.c index 75475264135224..5d5d62805e174d 100644 --- a/clang/test/Driver/linker-wrapper-image.c +++ b/clang/test/Driver/linker-wrapper-image.c @@ -26,12 +26,12 @@ // OPENMP: @.omp_offloading.device_image = internal unnamed_addr constant [[[SIZE:[0-9]+]] x i8] c"\10\FF\10\AD{{.*}}", section ".llvm.offloading", align 8 // OPENMP-NEXT: @.omp_offloading.device_images = internal unnamed_addr constant [1 x %__tgt_device_image] [%__tgt_device_image { ptr getelementptr inbounds ([[[BEGIN:[0-9]+]] x i8], ptr @.omp_offloading.device_image, i64 1, i64 0), ptr getelementptr inbounds ([[[END:[0-9]+]] x i8], ptr @.omp_offloading.device_image, i64 1, i64 0), ptr @__start_omp_offloading_entries, ptr @__stop_omp_offloading_entries }] // OPENMP-NEXT: @.omp_offloading.descriptor = internal constant %__tgt_bin_desc { i32 1, ptr @.omp_offloading.device_images, ptr @__start_omp_offloading_entries, ptr @__stop_omp_offloading_entries } -// OPENMP-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.omp_offloading.descriptor_reg, ptr null }] -// OPENMP-NEXT: @llvm.global_dtors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.omp_offloading.descriptor_unreg, ptr null }] +// OPENMP-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 101, ptr @.omp_offloading.descriptor_reg, ptr null }] // OPENMP: define internal void @.omp_offloading.descriptor_reg() section ".text.startup" { // OPENMP-NEXT: entry: // OPENMP-NEXT: call void @__tgt_register_lib(ptr @.omp_offloading.descriptor) +// OPENMP-NEXT: %0 = call i32 @atexit(ptr @.omp_offloading.descriptor_unreg) // OPENMP-NEXT: ret void // OPENMP-NEXT: } @@ -62,7 +62,7 @@ // CUDA-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1180844977, i32 1, ptr @.fatbin_image, ptr null }, section ".nvFatBinSegment", align 8 // CUDA-NEXT: @.cuda.binary_handle = internal global ptr null -// CUDA: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.cuda.fatbin_reg, ptr null }] +// CUDA: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 101, ptr @.cuda.fatbin_reg, ptr null }] // CUDA: define internal void @.cuda.fatbin_reg() section ".text.startup" { // CUDA-NEXT: entry: @@ -162,7 +162,7 @@ // HIP-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1212764230, i32 1, ptr @.fat