[llvm] [clang] [openmp] [OpenMP] Rework handling of global ctor/dtors in OpenMP (PR #71739)
https://github.com/jhuber6 closed https://github.com/llvm/llvm-project/pull/71739 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [openmp] [OpenMP] Rework handling of global ctor/dtors in OpenMP (PR #71739)
@@ -1038,6 +1048,109 @@ struct CUDADeviceTy : public GenericDeviceTy {

   using CUDAStreamManagerTy = GenericDeviceResourceManagerTy<CUDAStreamRef>;
   using CUDAEventManagerTy = GenericDeviceResourceManagerTy<CUDAEventRef>;

+  Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image,
+                                 bool IsCtor) {
+    // Perform a quick check for the named kernel in the image. The kernel
+    // should be created by the 'nvptx-lower-ctor-dtor' pass.
+    GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
+    GlobalTy Global(IsCtor ? "nvptx$device$init" : "nvptx$device$fini",
+                    sizeof(void *));
+    if (auto Err = Handler.getGlobalMetadataFromImage(*this, Image, Global)) {
+      consumeError(std::move(Err));
+      return Plugin::success();
+    }
+
+    // The Nvidia backend cannot handle creating the ctor / dtor array
+    // automatically so we must create it ourselves. The backend will emit
+    // several globals that contain function pointers we can call. These are
+    // prefixed with a known name due to Nvidia's lack of section support.
+    const ELF64LEObjectFile *ELFObj =
+        Handler.getOrCreateELFObjectFile(*this, Image);
+    if (!ELFObj)
+      return Plugin::error("Unable to create ELF object for image %p",
+                           Image.getStart());
+
+    // Search for all symbols that contain a constructor or destructor.
+    SmallVector<std::pair<StringRef, uint16_t>> Funcs;
+    for (ELFSymbolRef Sym : ELFObj->symbols()) {
+      auto NameOrErr = Sym.getName();
+      if (!NameOrErr)
+        return NameOrErr.takeError();
+
+      if (!NameOrErr->starts_with(IsCtor ? "__init_array_object_"
+                                         : "__fini_array_object_"))
+        continue;
+
+      uint16_t priority;
+      if (NameOrErr->rsplit('_').second.getAsInteger(10, priority))
+        return Plugin::error("Invalid priority for constructor or destructor");
+
+      Funcs.emplace_back(*NameOrErr, priority);
+    }
+
+    // Sort the created array to be in priority order.
+    llvm::sort(Funcs, [=](auto x, auto y) { return x.second < y.second; });
+
+    // Allocate a buffer to store all of the known constructor / destructor
+    // functions in so we can iterate them on the device.
+    void *Buffer =
+        allocate(Funcs.size() * sizeof(void *), nullptr, TARGET_ALLOC_SHARED);

jdoerfert wrote:

I'm more worried about systems that do not have support than about the time. If you think it's always supported, we can keep it for now.

https://github.com/llvm/llvm-project/pull/71739
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [openmp] [OpenMP] Rework handling of global ctor/dtors in OpenMP (PR #71739)
@@ -1038,6 +1048,109 @@ struct CUDADeviceTy : public GenericDeviceTy { using CUDAStreamManagerTy = GenericDeviceResourceManagerTy; using CUDAEventManagerTy = GenericDeviceResourceManagerTy; + Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image, + bool IsCtor) { +// Perform a quick check for the named kernel in the image. The kernel +// should be created by the 'nvptx-lower-ctor-dtor' pass. +GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler(); +GlobalTy Global(IsCtor ? "nvptx$device$init" : "nvptx$device$fini", +sizeof(void *)); +if (auto Err = Handler.getGlobalMetadataFromImage(*this, Image, Global)) { + consumeError(std::move(Err)); + return Plugin::success(); +} + +// The Nvidia backend cannot handle creating the ctor / dtor array +// automatically so we must create it ourselves. The backend will emit +// several globals that contain function pointers we can call. These are +// prefixed with a known name due to Nvidia's lack of section support. +const ELF64LEObjectFile *ELFObj = +Handler.getOrCreateELFObjectFile(*this, Image); +if (!ELFObj) + return Plugin::error("Unable to create ELF object for image %p", + Image.getStart()); + +// Search for all symbols that contain a constructor or destructor. +SmallVector> Funcs; +for (ELFSymbolRef Sym : ELFObj->symbols()) { + auto NameOrErr = Sym.getName(); + if (!NameOrErr) +return NameOrErr.takeError(); + + if (!NameOrErr->starts_with(IsCtor ? "__init_array_object_" + : "__fini_array_object_")) +continue; + + uint16_t priority; + if (NameOrErr->rsplit('_').second.getAsInteger(10, priority)) +return Plugin::error("Invalid priority for constructor or destructor"); + + Funcs.emplace_back(*NameOrErr, priority); +} + +// Sort the created array to be in priority order. +llvm::sort(Funcs, [=](auto x, auto y) { return x.second < y.second; }); + +// Allocate a buffer to store all of the known constructor / destructor +// functions in so we can iterate them on the device. 
+    void *Buffer =
+        allocate(Funcs.size() * sizeof(void *), nullptr, TARGET_ALLOC_SHARED);
+    if (!Buffer)
+      return Plugin::error("Failed to allocate memory for global buffer");
+
+    auto *GlobalPtrStart = reinterpret_cast<void **>(Buffer);
+    auto *GlobalPtrStop = reinterpret_cast<void **>(Buffer) + Funcs.size();
+
+    std::size_t Idx = 0;
+    for (auto [Name, Priority] : Funcs) {
+      GlobalTy FunctionAddr(Name.str(), sizeof(void *), &GlobalPtrStart[Idx++]);
+      if (auto Err = Handler.readGlobalFromDevice(*this, Image, FunctionAddr))
+        return std::move(Err);
+    }
+
+    // Copy the created buffer to the appropriate symbols so the kernel can
+    // iterate through them.
+    GlobalTy StartGlobal(IsCtor ? "__init_array_start" : "__fini_array_start",
+                         sizeof(void *), &GlobalPtrStart);
+    if (auto Err = Handler.writeGlobalToDevice(*this, Image, StartGlobal))
+      return std::move(Err);
+
+    GlobalTy StopGlobal(IsCtor ? "__init_array_end" : "__fini_array_end",
+                        sizeof(void *), &GlobalPtrStop);
+    if (auto Err = Handler.writeGlobalToDevice(*this, Image, StopGlobal))
+      return std::move(Err);
+
+    // Launch the kernel to execute the functions in the buffer.
+    GenericKernelTy *CUDAKernel = Plugin.allocate<CUDAKernelTy>();
+    if (!CUDAKernel)
+      return Plugin::error("Failed to allocate memory for CUDA kernel");
+
+    new (CUDAKernel)
+        CUDAKernelTy(IsCtor ? "nvptx$device$init" : "nvptx$device$fini");
+
+    if (auto Err = CUDAKernel->init(*this, Image))
+      return std::move(Err);
+
+    AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);
+
+    if (auto Err = initAsyncInfoImpl(AsyncInfoWrapper))

jdoerfert wrote:

You shouldn't need this.

https://github.com/llvm/llvm-project/pull/71739
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [openmp] [OpenMP] Rework handling of global ctor/dtors in OpenMP (PR #71739)
@@ -1038,6 +1048,109 @@ struct CUDADeviceTy : public GenericDeviceTy { using CUDAStreamManagerTy = GenericDeviceResourceManagerTy; using CUDAEventManagerTy = GenericDeviceResourceManagerTy; + Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image, + bool IsCtor) { +// Perform a quick check for the named kernel in the image. The kernel +// should be created by the 'nvptx-lower-ctor-dtor' pass. +GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler(); +GlobalTy Global(IsCtor ? "nvptx$device$init" : "nvptx$device$fini", +sizeof(void *)); +if (auto Err = Handler.getGlobalMetadataFromImage(*this, Image, Global)) { + consumeError(std::move(Err)); + return Plugin::success(); +} + +// The Nvidia backend cannot handle creating the ctor / dtor array +// automatically so we must create it ourselves. The backend will emit +// several globals that contain function pointers we can call. These are +// prefixed with a known name due to Nvidia's lack of section support. +const ELF64LEObjectFile *ELFObj = +Handler.getOrCreateELFObjectFile(*this, Image); +if (!ELFObj) + return Plugin::error("Unable to create ELF object for image %p", + Image.getStart()); + +// Search for all symbols that contain a constructor or destructor. +SmallVector> Funcs; +for (ELFSymbolRef Sym : ELFObj->symbols()) { + auto NameOrErr = Sym.getName(); + if (!NameOrErr) +return NameOrErr.takeError(); + + if (!NameOrErr->starts_with(IsCtor ? "__init_array_object_" + : "__fini_array_object_")) +continue; + + uint16_t priority; + if (NameOrErr->rsplit('_').second.getAsInteger(10, priority)) +return Plugin::error("Invalid priority for constructor or destructor"); + + Funcs.emplace_back(*NameOrErr, priority); +} + +// Sort the created array to be in priority order. +llvm::sort(Funcs, [=](auto x, auto y) { return x.second < y.second; }); + +// Allocate a buffer to store all of the known constructor / destructor +// functions in so we can iterate them on the device. 
+    void *Buffer =
+        allocate(Funcs.size() * sizeof(void *), nullptr, TARGET_ALLOC_SHARED);
+    if (!Buffer)
+      return Plugin::error("Failed to allocate memory for global buffer");
+
+    auto *GlobalPtrStart = reinterpret_cast<void **>(Buffer);
+    auto *GlobalPtrStop = reinterpret_cast<void **>(Buffer) + Funcs.size();
+
+    std::size_t Idx = 0;
+    for (auto [Name, Priority] : Funcs) {
+      GlobalTy FunctionAddr(Name.str(), sizeof(void *), &GlobalPtrStart[Idx++]);
+      if (auto Err = Handler.readGlobalFromDevice(*this, Image, FunctionAddr))
+        return std::move(Err);
+    }
+
+    // Copy the created buffer to the appropriate symbols so the kernel can
+    // iterate through them.
+    GlobalTy StartGlobal(IsCtor ? "__init_array_start" : "__fini_array_start",
+                         sizeof(void *), &GlobalPtrStart);
+    if (auto Err = Handler.writeGlobalToDevice(*this, Image, StartGlobal))
+      return std::move(Err);
+
+    GlobalTy StopGlobal(IsCtor ? "__init_array_end" : "__fini_array_end",
+                        sizeof(void *), &GlobalPtrStop);
+    if (auto Err = Handler.writeGlobalToDevice(*this, Image, StopGlobal))
+      return std::move(Err);
+
+    // Launch the kernel to execute the functions in the buffer.
+    GenericKernelTy *CUDAKernel = Plugin.allocate<CUDAKernelTy>();
+    if (!CUDAKernel)
+      return Plugin::error("Failed to allocate memory for CUDA kernel");
+
+    new (CUDAKernel)
+        CUDAKernelTy(IsCtor ? "nvptx$device$init" : "nvptx$device$fini");

jdoerfert wrote:

> IsCtor ? "nvptx$device$init" : "nvptx$device$fini"

Do this once, other such ternaries as well.

https://github.com/llvm/llvm-project/pull/71739
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [openmp] [OpenMP] Rework handling of global ctor/dtors in OpenMP (PR #71739)
@@ -2794,6 +2794,14 @@ void ItaniumCXXABI::registerGlobalDtor(CodeGenFunction &CGF, const VarDecl &D, if (D.isNoDestroy(CGM.getContext())) return; + // OpenMP offloading supports C++ constructors and destructors but we do not + // always have 'atexit' available. Instead lower these to use the LLVM global + // destructors which we can handle directly in the runtime. + if (CGM.getLangOpts().OpenMP && CGM.getLangOpts().OpenMPIsTargetDevice && + !D.isStaticLocal() && + (CGM.getTriple().isAMDGPU() || CGM.getTriple().isNVPTX())) arsenm wrote: Would also just hide this in a target/lang predicate that lists these https://github.com/llvm/llvm-project/pull/71739 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [openmp] [OpenMP] Rework handling of global ctor/dtors in OpenMP (PR #71739)
@@ -2627,6 +2637,48 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {

   using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>;
   using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy<AMDGPUEventRef>;

+  /// Common method to invoke a single threaded constructor or destructor
+  /// kernel by name.
+  Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image,
+                                 const char *Name) {
+    // Perform a quick check for the named kernel in the image. The kernel
+    // should be created by the 'amdgpu-lower-ctor-dtor' pass.
+    GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
+    GlobalTy Global(Name, sizeof(void *));
+    if (auto Err = Handler.getGlobalMetadataFromImage(*this, Image, Global)) {
+      consumeError(std::move(Err));
+      return Error::success();
+    }
+
+    // Allocate and construct the AMDGPU kernel.
+    GenericKernelTy *AMDGPUKernel = Plugin.allocate<AMDGPUKernelTy>();
+    if (!AMDGPUKernel)
+      return Plugin::error("Failed to allocate memory for AMDGPU kernel");
+
+    new (AMDGPUKernel) AMDGPUKernelTy(Name);
+    if (auto Err = AMDGPUKernel->initImpl(*this, Image))
+      return std::move(Err);
+
+    auto *AsyncInfoPtr = Plugin.allocate<__tgt_async_info>();
+    AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfoPtr);
+
+    if (auto Err = initAsyncInfoImpl(AsyncInfoWrapper))
+      return std::move(Err);
+
+    KernelArgsTy KernelArgs = {};
+    if (auto Err = AMDGPUKernel->launchImpl(*this, /*NumThread=*/1u,
+                                            /*NumBlocks=*/1ul, KernelArgs,
+                                            /*Args=*/nullptr, AsyncInfoWrapper))
+      return std::move(Err);
+
+    if (auto Err = synchronize(AsyncInfoPtr))
+      return std::move(Err);
+    Error Err = Error::success();

jplehr wrote:

Should this be `Plugin::success()` instead here as well?

https://github.com/llvm/llvm-project/pull/71739
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits