[llvm] [clang] [openmp] [OpenMP] Rework handling of global ctor/dtors in OpenMP (PR #71739)

2023-11-10 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 closed 
https://github.com/llvm/llvm-project/pull/71739
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [openmp] [OpenMP] Rework handling of global ctor/dtors in OpenMP (PR #71739)

2023-11-10 Thread Johannes Doerfert via cfe-commits


@@ -1038,6 +1048,109 @@ struct CUDADeviceTy : public GenericDeviceTy {
   using CUDAStreamManagerTy = GenericDeviceResourceManagerTy;
   using CUDAEventManagerTy = GenericDeviceResourceManagerTy;
 
+  Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image,
+ bool IsCtor) {
+// Perform a quick check for the named kernel in the image. The kernel
+// should be created by the 'nvptx-lower-ctor-dtor' pass.
+GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
+GlobalTy Global(IsCtor ? "nvptx$device$init" : "nvptx$device$fini",
+sizeof(void *));
+if (auto Err = Handler.getGlobalMetadataFromImage(*this, Image, Global)) {
+  consumeError(std::move(Err));
+  return Plugin::success();
+}
+
+// The Nvidia backend cannot handle creating the ctor / dtor array
+// automatically so we must create it ourselves. The backend will emit
+// several globals that contain function pointers we can call. These are
+// prefixed with a known name due to Nvidia's lack of section support.
+const ELF64LEObjectFile *ELFObj =
+Handler.getOrCreateELFObjectFile(*this, Image);
+if (!ELFObj)
+  return Plugin::error("Unable to create ELF object for image %p",
+   Image.getStart());
+
+// Search for all symbols that contain a constructor or destructor.
+SmallVector<std::pair<StringRef, uint16_t>> Funcs;
+for (ELFSymbolRef Sym : ELFObj->symbols()) {
+  auto NameOrErr = Sym.getName();
+  if (!NameOrErr)
+return NameOrErr.takeError();
+
+  if (!NameOrErr->starts_with(IsCtor ? "__init_array_object_"
+ : "__fini_array_object_"))
+continue;
+
+  uint16_t priority;
+  if (NameOrErr->rsplit('_').second.getAsInteger(10, priority))
+return Plugin::error("Invalid priority for constructor or destructor");
+
+  Funcs.emplace_back(*NameOrErr, priority);
+}
+
+// Sort the created array to be in priority order.
+llvm::sort(Funcs, [=](auto x, auto y) { return x.second < y.second; });
+
+// Allocate a buffer to store all of the known constructor / destructor
+// functions in so we can iterate them on the device.
+void *Buffer =
+allocate(Funcs.size() * sizeof(void *), nullptr, TARGET_ALLOC_SHARED);

jdoerfert wrote:

I'm more worried about systems that do not have support than about the time.
If you think it's always supported, we can keep it for now.

https://github.com/llvm/llvm-project/pull/71739
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [openmp] [OpenMP] Rework handling of global ctor/dtors in OpenMP (PR #71739)

2023-11-10 Thread Johannes Doerfert via cfe-commits


@@ -1038,6 +1048,109 @@ struct CUDADeviceTy : public GenericDeviceTy {
   using CUDAStreamManagerTy = GenericDeviceResourceManagerTy;
   using CUDAEventManagerTy = GenericDeviceResourceManagerTy;
 
+  Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image,
+ bool IsCtor) {
+// Perform a quick check for the named kernel in the image. The kernel
+// should be created by the 'nvptx-lower-ctor-dtor' pass.
+GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
+GlobalTy Global(IsCtor ? "nvptx$device$init" : "nvptx$device$fini",
+sizeof(void *));
+if (auto Err = Handler.getGlobalMetadataFromImage(*this, Image, Global)) {
+  consumeError(std::move(Err));
+  return Plugin::success();
+}
+
+// The Nvidia backend cannot handle creating the ctor / dtor array
+// automatically so we must create it ourselves. The backend will emit
+// several globals that contain function pointers we can call. These are
+// prefixed with a known name due to Nvidia's lack of section support.
+const ELF64LEObjectFile *ELFObj =
+Handler.getOrCreateELFObjectFile(*this, Image);
+if (!ELFObj)
+  return Plugin::error("Unable to create ELF object for image %p",
+   Image.getStart());
+
+// Search for all symbols that contain a constructor or destructor.
+SmallVector<std::pair<StringRef, uint16_t>> Funcs;
+for (ELFSymbolRef Sym : ELFObj->symbols()) {
+  auto NameOrErr = Sym.getName();
+  if (!NameOrErr)
+return NameOrErr.takeError();
+
+  if (!NameOrErr->starts_with(IsCtor ? "__init_array_object_"
+ : "__fini_array_object_"))
+continue;
+
+  uint16_t priority;
+  if (NameOrErr->rsplit('_').second.getAsInteger(10, priority))
+return Plugin::error("Invalid priority for constructor or destructor");
+
+  Funcs.emplace_back(*NameOrErr, priority);
+}
+
+// Sort the created array to be in priority order.
+llvm::sort(Funcs, [=](auto x, auto y) { return x.second < y.second; });
+
+// Allocate a buffer to store all of the known constructor / destructor
+// functions in so we can iterate them on the device.
+void *Buffer =
+allocate(Funcs.size() * sizeof(void *), nullptr, TARGET_ALLOC_SHARED);
+if (!Buffer)
+  return Plugin::error("Failed to allocate memory for global buffer");
+
+auto *GlobalPtrStart = reinterpret_cast<uintptr_t *>(Buffer);
+auto *GlobalPtrStop = reinterpret_cast<uintptr_t *>(Buffer) + Funcs.size();
+
+std::size_t Idx = 0;
+for (auto [Name, Priority] : Funcs) {
+  GlobalTy FunctionAddr(Name.str(), sizeof(void *),
+&GlobalPtrStart[Idx++]);
+  if (auto Err = Handler.readGlobalFromDevice(*this, Image, FunctionAddr))
+return std::move(Err);
+}
+
+// Copy the created buffer to the appropriate symbols so the kernel can
+// iterate through them.
+GlobalTy StartGlobal(IsCtor ? "__init_array_start" : "__fini_array_start",
+ sizeof(void *), &GlobalPtrStart);
+if (auto Err = Handler.writeGlobalToDevice(*this, Image, StartGlobal))
+  return std::move(Err);
+
+GlobalTy StopGlobal(IsCtor ? "__init_array_end" : "__fini_array_end",
+sizeof(void *), &GlobalPtrStop);
+if (auto Err = Handler.writeGlobalToDevice(*this, Image, StopGlobal))
+  return std::move(Err);
+
+// Launch the kernel to execute the functions in the buffer.
+GenericKernelTy *CUDAKernel = Plugin.allocate<CUDAKernelTy>();
+if (!CUDAKernel)
+  return Plugin::error("Failed to allocate memory for CUDA kernel");
+
+new (CUDAKernel)
+CUDAKernelTy(IsCtor ? "nvptx$device$init" : "nvptx$device$fini");
+
+if (auto Err = CUDAKernel->init(*this, Image))
+  return std::move(Err);
+
+AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);
+
+if (auto Err = initAsyncInfoImpl(AsyncInfoWrapper))

jdoerfert wrote:

You shouldn't need this.

https://github.com/llvm/llvm-project/pull/71739
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [openmp] [OpenMP] Rework handling of global ctor/dtors in OpenMP (PR #71739)

2023-11-10 Thread Johannes Doerfert via cfe-commits


@@ -1038,6 +1048,109 @@ struct CUDADeviceTy : public GenericDeviceTy {
   using CUDAStreamManagerTy = GenericDeviceResourceManagerTy;
   using CUDAEventManagerTy = GenericDeviceResourceManagerTy;
 
+  Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image,
+ bool IsCtor) {
+// Perform a quick check for the named kernel in the image. The kernel
+// should be created by the 'nvptx-lower-ctor-dtor' pass.
+GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
+GlobalTy Global(IsCtor ? "nvptx$device$init" : "nvptx$device$fini",
+sizeof(void *));
+if (auto Err = Handler.getGlobalMetadataFromImage(*this, Image, Global)) {
+  consumeError(std::move(Err));
+  return Plugin::success();
+}
+
+// The Nvidia backend cannot handle creating the ctor / dtor array
+// automatically so we must create it ourselves. The backend will emit
+// several globals that contain function pointers we can call. These are
+// prefixed with a known name due to Nvidia's lack of section support.
+const ELF64LEObjectFile *ELFObj =
+Handler.getOrCreateELFObjectFile(*this, Image);
+if (!ELFObj)
+  return Plugin::error("Unable to create ELF object for image %p",
+   Image.getStart());
+
+// Search for all symbols that contain a constructor or destructor.
+SmallVector<std::pair<StringRef, uint16_t>> Funcs;
+for (ELFSymbolRef Sym : ELFObj->symbols()) {
+  auto NameOrErr = Sym.getName();
+  if (!NameOrErr)
+return NameOrErr.takeError();
+
+  if (!NameOrErr->starts_with(IsCtor ? "__init_array_object_"
+ : "__fini_array_object_"))
+continue;
+
+  uint16_t priority;
+  if (NameOrErr->rsplit('_').second.getAsInteger(10, priority))
+return Plugin::error("Invalid priority for constructor or destructor");
+
+  Funcs.emplace_back(*NameOrErr, priority);
+}
+
+// Sort the created array to be in priority order.
+llvm::sort(Funcs, [=](auto x, auto y) { return x.second < y.second; });
+
+// Allocate a buffer to store all of the known constructor / destructor
+// functions in so we can iterate them on the device.
+void *Buffer =
+allocate(Funcs.size() * sizeof(void *), nullptr, TARGET_ALLOC_SHARED);
+if (!Buffer)
+  return Plugin::error("Failed to allocate memory for global buffer");
+
+auto *GlobalPtrStart = reinterpret_cast<uintptr_t *>(Buffer);
+auto *GlobalPtrStop = reinterpret_cast<uintptr_t *>(Buffer) + Funcs.size();
+
+std::size_t Idx = 0;
+for (auto [Name, Priority] : Funcs) {
+  GlobalTy FunctionAddr(Name.str(), sizeof(void *),
+&GlobalPtrStart[Idx++]);
+  if (auto Err = Handler.readGlobalFromDevice(*this, Image, FunctionAddr))
+return std::move(Err);
+}
+
+// Copy the created buffer to the appropriate symbols so the kernel can
+// iterate through them.
+GlobalTy StartGlobal(IsCtor ? "__init_array_start" : "__fini_array_start",
+ sizeof(void *), &GlobalPtrStart);
+if (auto Err = Handler.writeGlobalToDevice(*this, Image, StartGlobal))
+  return std::move(Err);
+
+GlobalTy StopGlobal(IsCtor ? "__init_array_end" : "__fini_array_end",
+sizeof(void *), &GlobalPtrStop);
+if (auto Err = Handler.writeGlobalToDevice(*this, Image, StopGlobal))
+  return std::move(Err);
+
+// Launch the kernel to execute the functions in the buffer.
+GenericKernelTy *CUDAKernel = Plugin.allocate<CUDAKernelTy>();
+if (!CUDAKernel)
+  return Plugin::error("Failed to allocate memory for CUDA kernel");
+
+new (CUDAKernel)
+CUDAKernelTy(IsCtor ? "nvptx$device$init" : "nvptx$device$fini");

jdoerfert wrote:

> IsCtor ? "nvptx$device$init" : "nvptx$device$fini"

Do this once, other such ternaries as well.

https://github.com/llvm/llvm-project/pull/71739
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [openmp] [OpenMP] Rework handling of global ctor/dtors in OpenMP (PR #71739)

2023-11-09 Thread Matt Arsenault via cfe-commits


@@ -2794,6 +2794,14 @@ void ItaniumCXXABI::registerGlobalDtor(CodeGenFunction 
, const VarDecl ,
   if (D.isNoDestroy(CGM.getContext()))
 return;
 
+  // OpenMP offloading supports C++ constructors and destructors but we do not
+  // always have 'atexit' available. Instead lower these to use the LLVM global
+  // destructors which we can handle directly in the runtime.
+  if (CGM.getLangOpts().OpenMP && CGM.getLangOpts().OpenMPIsTargetDevice &&
+  !D.isStaticLocal() &&
+  (CGM.getTriple().isAMDGPU() || CGM.getTriple().isNVPTX()))

arsenm wrote:

Would also just hide this in a target/lang predicate that lists these 

https://github.com/llvm/llvm-project/pull/71739
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [openmp] [OpenMP] Rework handling of global ctor/dtors in OpenMP (PR #71739)

2023-11-09 Thread Jan Patrick Lehr via cfe-commits


@@ -2627,6 +2637,48 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, 
AMDGenericDeviceTy {
   using AMDGPUEventRef = AMDGPUResourceRef;
   using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy;
 
+  /// Common method to invoke a single threaded constructor or destructor
+  /// kernel by name.
+  Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image,
+ const char *Name) {
+// Perform a quick check for the named kernel in the image. The kernel
+// should be created by the 'amdgpu-lower-ctor-dtor' pass.
+GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
+GlobalTy Global(Name, sizeof(void *));
+if (auto Err = Handler.getGlobalMetadataFromImage(*this, Image, Global)) {
+  consumeError(std::move(Err));
+  return Error::success();
+}
+
+// Allocate and construct the AMDGPU kernel.
+GenericKernelTy *AMDGPUKernel = Plugin.allocate<AMDGPUKernelTy>();
+if (!AMDGPUKernel)
+  return Plugin::error("Failed to allocate memory for AMDGPU kernel");
+
+new (AMDGPUKernel) AMDGPUKernelTy(Name);
+if (auto Err = AMDGPUKernel->initImpl(*this, Image))
+  return std::move(Err);
+
+auto *AsyncInfoPtr = Plugin.allocate<__tgt_async_info>();
+AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfoPtr);
+
+if (auto Err = initAsyncInfoImpl(AsyncInfoWrapper))
+  return std::move(Err);
+
+KernelArgsTy KernelArgs = {};
+if (auto Err = AMDGPUKernel->launchImpl(*this, /*NumThread=*/1u,
+/*NumBlocks=*/1ul, KernelArgs,
+/*Args=*/nullptr, 
AsyncInfoWrapper))
+  return std::move(Err);
+
+if (auto Err = synchronize(AsyncInfoPtr))
+  return std::move(Err);
+Error Err = Error::success();

jplehr wrote:

Should this be `Plugin::success()` instead here as well?

https://github.com/llvm/llvm-project/pull/71739
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits