[PATCH] D123812: [CUDA] Add wrapper code generation for registering CUDA images

2022-05-11 Thread Joseph Huber via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rGf49d576a882d: [CUDA] Add wrapper code generation for 
registering CUDA images (authored by jhuber6).

Changed prior to commit:
  https://reviews.llvm.org/D123812?vs=426467&id=428622#toc

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D123812/new/

https://reviews.llvm.org/D123812

Files:
  clang/test/Driver/linker-wrapper-image.c
  clang/test/Driver/linker-wrapper.c
  clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
  clang/tools/clang-linker-wrapper/OffloadWrapper.cpp

Index: clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
===
--- clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
+++ clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
@@ -20,6 +20,8 @@
 using namespace llvm;
 
 namespace {
+/// Magic number that begins the section containing the CUDA fatbinary.
+constexpr unsigned CudaFatMagic = 0x466243b1;
 
 IntegerType *getSizeTTy(Module &M) {
   LLVMContext &C = M.getContext();
@@ -255,6 +257,278 @@
   appendToGlobalDtors(M, Func, /*Priority*/ 1);
 }
 
+// struct fatbin_wrapper {
+//  int32_t magic;
+//  int32_t version;
+//  void *image;
+//  void *reserved;
+//};
+StructType *getFatbinWrapperTy(Module &M) {
+  LLVMContext &C = M.getContext();
+  StructType *FatbinTy = StructType::getTypeByName(C, "fatbin_wrapper");
+  if (!FatbinTy)
+FatbinTy = StructType::create("fatbin_wrapper", Type::getInt32Ty(C),
+  Type::getInt32Ty(C), Type::getInt8PtrTy(C),
+  Type::getInt8PtrTy(C));
+  return FatbinTy;
+}
+
+/// Embed the image \p Image into the module \p M so it can be found by the
+/// runtime.
+GlobalVariable *createFatbinDesc(Module &M, ArrayRef<char> Image) {
+  LLVMContext &C = M.getContext();
+  llvm::Type *Int8PtrTy = Type::getInt8PtrTy(C);
+  llvm::Triple Triple = llvm::Triple(M.getTargetTriple());
+
+  // Create the global string containing the fatbinary.
+  StringRef FatbinConstantSection =
+  Triple.isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
+  auto *Data = ConstantDataArray::get(C, Image);
+  auto *Fatbin = new GlobalVariable(M, Data->getType(), /*isConstant*/ true,
+GlobalVariable::InternalLinkage, Data,
+".fatbin_image");
+  Fatbin->setSection(FatbinConstantSection);
+
+  // Create the fatbinary wrapper
+  StringRef FatbinWrapperSection =
+  Triple.isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
+  Constant *FatbinWrapper[] = {
+  ConstantInt::get(Type::getInt32Ty(C), CudaFatMagic),
+  ConstantInt::get(Type::getInt32Ty(C), 1),
+  ConstantExpr::getPointerBitCastOrAddrSpaceCast(Fatbin, Int8PtrTy),
+  ConstantPointerNull::get(Type::getInt8PtrTy(C))};
+
+  Constant *FatbinInitializer =
+  ConstantStruct::get(getFatbinWrapperTy(M), FatbinWrapper);
+
+  auto *FatbinDesc =
+  new GlobalVariable(M, getFatbinWrapperTy(M),
+ /*isConstant*/ true, GlobalValue::InternalLinkage,
+ FatbinInitializer, ".fatbin_wrapper");
+  FatbinDesc->setSection(FatbinWrapperSection);
+  FatbinDesc->setAlignment(Align(8));
+
+  // We create a dummy entry to ensure the linker will define the begin / end
+  // symbols. The CUDA runtime should ignore the null address if we attempt to
+  // register it.
+  auto *DummyInit =
+  ConstantAggregateZero::get(ArrayType::get(getEntryTy(M), 0u));
+  auto *DummyEntry = new GlobalVariable(
+  M, DummyInit->getType(), true, GlobalVariable::ExternalLinkage, DummyInit,
+  "__dummy.cuda_offloading.entry");
+  DummyEntry->setSection("cuda_offloading_entries");
+  DummyEntry->setVisibility(GlobalValue::HiddenVisibility);
+
+  return FatbinDesc;
+}
+
+/// Create the register globals function. We will iterate all of the offloading
+/// entries stored at the begin / end symbols and register them according to
+/// their type. This creates the following function in IR:
+///
+/// extern struct __tgt_offload_entry __start_cuda_offloading_entries;
+/// extern struct __tgt_offload_entry __stop_cuda_offloading_entries;
+///
+/// extern void __cudaRegisterFunction(void **, void *, void *, void *, int,
+///void *, void *, void *, void *, int *);
+/// extern void __cudaRegisterVar(void **, void *, void *, void *, int32_t,
+///   int64_t, int32_t, int32_t);
+///
+/// void __cudaRegisterTest(void **fatbinHandle) {
+///   for (struct __tgt_offload_entry *entry = &__start_cuda_offloading_entries;
+///entry != &__stop_cuda_offloading_entries; ++entry) {
+/// if (!entry->size)
+///   __cudaRegisterFunction(fatbinHandle, entry->addr, entry->name,
+///  entry->name, -1, 0, 0, 0, 0, 0);
+/// else
+///   __cudaRegisterVar(fatbinHandle, entry->addr, entry->name, 

[PATCH] D123812: [CUDA] Add wrapper code generation for registering CUDA images

2022-05-10 Thread Joseph Huber via Phabricator via cfe-commits
jhuber6 added inline comments.



Comment at: clang/tools/clang-linker-wrapper/OffloadWrapper.cpp:351
+///   required for texture / surface / managed variables.
+Function *createRegisterGlobalsFunction(Module &M) {
+  LLVMContext &C = M.getContext();

tra wrote:
> jdoerfert wrote:
> > jhuber6 wrote:
> > > tra wrote:
> > > > jhuber6 wrote:
> > > > > tra wrote:
> > > > > > Do you think generation of the CUDA registration glue could be 
> > > > > > shared with the front-end?
> > > > > > 
> > > > > I was thinking about it, but ultimately decided to keep the noise 
> > > > > outside of the new driver to a minimum. Maybe if we move to the 
> > > > > offloading entries being a common format we can easily share this 
> > > > > code. Keeping it in Clang would have the advantage that it's easier 
> > > > > to test directly and ensures we don't de-sync if anything changes. 
> > > > > The only downside is that in the future I may want to push this 
> > > > > functionality to a linker plugin or similar, which would require 
> > > > > pulling it out of Clang again to prevent us needing to link in Clang 
> > > > > to build LLVM.
> > > > > 
> > > > > Also needing to do this all through the builder API isn't ideal, it 
> > > > > would be nice if we had some kind of runtime to call to do this for 
> > > > > us, but I didn't feel like adding yet another shared library for 
> > > > > CUDA. I considered putting it inside the cuda header wrappers as 
> > > > > well, but forcing every CUDA file to have some externally visible 
> > > > > weak registration blob didn't sit well with me.
> > > > Perhaps front-end is not the right place for it, indeed. LLVM itself 
> > > > may be a better choice. We already have some things there for somewhat 
> > > > similar purposes (like lib/WindowsManifest) so adding a helper function 
> > > > to generate runtime glue for CUDA should not be unreasonable.
> > > I think it's fine here for this patch, but I definitely want to move it 
> > > into LLVM in the future once I start generalizing more of this stuff.
> > I'm OK with it being here but the place to consider (IMHO) is 
> > `llvm/lib/Frontend`, maybe `/CUDA/Register.cpp`.
> OK. I'm fine keeping it all here for now.
> Please add a comment pointing towards the origin of this code and, maybe, a 
> TODO item to consolidate and move it into a better place.
Will do, thanks for the reviews. I'll land these tomorrow morning and see if 
anything breaks.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D123812/new/

https://reviews.llvm.org/D123812

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D123812: [CUDA] Add wrapper code generation for registering CUDA images

2022-05-10 Thread Artem Belevich via Phabricator via cfe-commits
tra accepted this revision.
tra added inline comments.
This revision is now accepted and ready to land.



Comment at: clang/tools/clang-linker-wrapper/OffloadWrapper.cpp:351
+///   required for texture / surface / managed variables.
+Function *createRegisterGlobalsFunction(Module &M) {
+  LLVMContext &C = M.getContext();

jdoerfert wrote:
> jhuber6 wrote:
> > tra wrote:
> > > jhuber6 wrote:
> > > > tra wrote:
> > > > > Do you think generation of the CUDA registration glue could be shared 
> > > > > with the front-end?
> > > > > 
> > > > I was thinking about it, but ultimately decided to keep the noise 
> > > > outside of the new driver to a minimum. Maybe if we move to the 
> > > > offloading entries being a common format we can easily share this code. 
> > > > Keeping it in Clang would have the advantage that it's easier to test 
> > > > directly and ensures we don't de-sync if anything changes. The only 
> > > > downside is that in the future I may want to push this functionality to 
> > > > a linker plugin or similar, which would require pulling it out of Clang 
> > > > again to prevent us needing to link in Clang to build LLVM.
> > > > 
> > > > Also needing to do this all through the builder API isn't ideal, it 
> > > > would be nice if we had some kind of runtime to call to do this for us, 
> > > > but I didn't feel like adding yet another shared library for CUDA. I 
> > > > considered putting it inside the cuda header wrappers as well, but 
> > > > forcing every CUDA file to have some externally visible weak 
> > > > registration blob didn't sit well with me.
> > > Perhaps front-end is not the right place for it, indeed. LLVM itself may 
> > > be a better choice. We already have some things there for somewhat 
> > > similar purposes (like lib/WindowsManifest) so adding a helper function 
> > > to generate runtime glue for CUDA should not be unreasonable.
> > I think it's fine here for this patch, but I definitely want to move it 
> > into LLVM in the future once I start generalizing more of this stuff.
> I'm OK with it being here but the place to consider (IMHO) is 
> `llvm/lib/Frontend`, maybe `/CUDA/Register.cpp`.
OK. I'm fine keeping it all here for now.
Please add a comment pointing towards the origin of this code and, maybe, a 
TODO item to consolidate and move it into a better place.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D123812/new/

https://reviews.llvm.org/D123812

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D123812: [CUDA] Add wrapper code generation for registering CUDA images

2022-05-06 Thread Joseph Huber via Phabricator via cfe-commits
jhuber6 added a comment.

In D123812#3496914, @yaxunl wrote:

> LGTM.

Did you forget to accept the revision? D123810 and D123471 still need to be 
looked at, but these are mostly non-intrusive so I don't think they'll break 
anything.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D123812/new/

https://reviews.llvm.org/D123812

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D123812: [CUDA] Add wrapper code generation for registering CUDA images

2022-05-06 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl added a comment.

LGTM.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D123812/new/

https://reviews.llvm.org/D123812

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D123812: [CUDA] Add wrapper code generation for registering CUDA images

2022-05-06 Thread Johannes Doerfert via Phabricator via cfe-commits
jdoerfert added a comment.

Are there unresolved concerns we should address in this review?




Comment at: clang/tools/clang-linker-wrapper/OffloadWrapper.cpp:351
+///   required for texture / surface / managed variables.
+Function *createRegisterGlobalsFunction(Module &M) {
+  LLVMContext &C = M.getContext();

jhuber6 wrote:
> tra wrote:
> > jhuber6 wrote:
> > > tra wrote:
> > > > Do you think generation of the CUDA registration glue could be shared 
> > > > with the front-end?
> > > > 
> > > I was thinking about it, but ultimately decided to keep the noise outside 
> > > of the new driver to a minimum. Maybe if we move to the offloading 
> > > entries being a common format we can easily share this code. Keeping it 
> > > in Clang would have the advantage that it's easier to test directly and 
> > > ensures we don't de-sync if anything changes. The only downside is that 
> > > in the future I may want to push this functionality to a linker plugin or 
> > > similar, which would require pulling it out of Clang again to prevent us 
> > > needing to link in Clang to build LLVM.
> > > 
> > > Also needing to do this all through the builder API isn't ideal, it would 
> > > be nice if we had some kind of runtime to call to do this for us, but I 
> > > didn't feel like adding yet another shared library for CUDA. I considered 
> > > putting it inside the cuda header wrappers as well, but forcing every 
> > > CUDA file to have some externally visible weak registration blob didn't 
> > > sit well with me.
> > Perhaps front-end is not the right place for it, indeed. LLVM itself may be 
> > a better choice. We already have some things there for somewhat similar 
> > purposes (like lib/WindowsManifest) so adding a helper function to generate 
> > runtime glue for CUDA should not be unreasonable.
> I think it's fine here for this patch, but I definitely want to move it into 
> LLVM in the future once I start generalizing more of this stuff.
I'm OK with it being here but the place to consider (IMHO) is 
`llvm/lib/Frontend`, maybe `/CUDA/Register.cpp`.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D123812/new/

https://reviews.llvm.org/D123812

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D123812: [CUDA] Add wrapper code generation for registering CUDA images

2022-05-06 Thread Joseph Huber via Phabricator via cfe-commits
jhuber6 added a comment.

ping


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D123812/new/

https://reviews.llvm.org/D123812

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D123812: [CUDA] Add wrapper code generation for registering CUDA images

2022-05-02 Thread Joseph Huber via Phabricator via cfe-commits
jhuber6 updated this revision to Diff 426467.
jhuber6 added a comment.

Updating code generation. Previously we would seg-fault in the case that no 
offloading entries were created. To solve this we simply check that Begin != 
End before trying to register anything.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D123812/new/

https://reviews.llvm.org/D123812

Files:
  clang/test/Driver/linker-wrapper-image.c
  clang/test/Driver/linker-wrapper.c
  clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
  clang/tools/clang-linker-wrapper/OffloadWrapper.cpp

Index: clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
===
--- clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
+++ clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
@@ -20,6 +20,8 @@
 using namespace llvm;
 
 namespace {
+/// Magic number that begins the section containing the CUDA fatbinary.
+constexpr unsigned CudaFatMagic = 0x466243b1;
 
 IntegerType *getSizeTTy(Module &M) {
   LLVMContext &C = M.getContext();
@@ -255,6 +257,278 @@
   appendToGlobalDtors(M, Func, /*Priority*/ 1);
 }
 
+// struct fatbin_wrapper {
+//  int32_t magic;
+//  int32_t version;
+//  void *image;
+//  void *reserved;
+//};
+StructType *getFatbinWrapperTy(Module &M) {
+  LLVMContext &C = M.getContext();
+  StructType *FatbinTy = StructType::getTypeByName(C, "fatbin_wrapper");
+  if (!FatbinTy)
+FatbinTy = StructType::create("fatbin_wrapper", Type::getInt32Ty(C),
+  Type::getInt32Ty(C), Type::getInt8PtrTy(C),
+  Type::getInt8PtrTy(C));
+  return FatbinTy;
+}
+
+/// Embed the image \p Image into the module \p M so it can be found by the
+/// runtime.
+GlobalVariable *createFatbinDesc(Module &M, ArrayRef<char> Image) {
+  LLVMContext &C = M.getContext();
+  llvm::Type *Int8PtrTy = Type::getInt8PtrTy(C);
+  llvm::Triple Triple = llvm::Triple(M.getTargetTriple());
+
+  // Create the global string containing the fatbinary.
+  StringRef FatbinConstantSection =
+  Triple.isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
+  auto *Data = ConstantDataArray::get(C, Image);
+  auto *Fatbin = new GlobalVariable(M, Data->getType(), /*isConstant*/ true,
+GlobalVariable::InternalLinkage, Data,
+".fatbin_image");
+  Fatbin->setSection(FatbinConstantSection);
+
+  // Create the fatbinary wrapper
+  StringRef FatbinWrapperSection =
+  Triple.isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
+  Constant *FatbinWrapper[] = {
+  ConstantInt::get(Type::getInt32Ty(C), CudaFatMagic),
+  ConstantInt::get(Type::getInt32Ty(C), 1),
+  ConstantExpr::getPointerBitCastOrAddrSpaceCast(Fatbin, Int8PtrTy),
+  ConstantPointerNull::get(Type::getInt8PtrTy(C))};
+
+  Constant *FatbinInitializer =
+  ConstantStruct::get(getFatbinWrapperTy(M), FatbinWrapper);
+
+  auto *FatbinDesc =
+  new GlobalVariable(M, getFatbinWrapperTy(M),
+ /*isConstant*/ true, GlobalValue::InternalLinkage,
+ FatbinInitializer, ".fatbin_wrapper");
+  FatbinDesc->setSection(FatbinWrapperSection);
+  FatbinDesc->setAlignment(Align(8));
+
+  // We create a dummy entry to ensure the linker will define the begin / end
+  // symbols. The CUDA runtime should ignore the null address if we attempt to
+  // register it.
+  auto *DummyInit =
+  ConstantAggregateZero::get(ArrayType::get(getEntryTy(M), 0u));
+  auto *DummyEntry = new GlobalVariable(
+  M, DummyInit->getType(), true, GlobalVariable::ExternalLinkage, DummyInit,
+  "__dummy.cuda_offloading.entry");
+  DummyEntry->setSection("cuda_offloading_entries");
+  DummyEntry->setVisibility(GlobalValue::HiddenVisibility);
+
+  return FatbinDesc;
+}
+
+/// Create the register globals function. We will iterate all of the offloading
+/// entries stored at the begin / end symbols and register them according to
+/// their type. This creates the following function in IR:
+///
+/// extern struct __tgt_offload_entry __start_cuda_offloading_entries;
+/// extern struct __tgt_offload_entry __stop_cuda_offloading_entries;
+///
+/// extern void __cudaRegisterFunction(void **, void *, void *, void *, int,
+///void *, void *, void *, void *, int *);
+/// extern void __cudaRegisterVar(void **, void *, void *, void *, int32_t,
+///   int64_t, int32_t, int32_t);
+///
+/// void __cudaRegisterTest(void **fatbinHandle) {
+///   for (struct __tgt_offload_entry *entry = &__start_cuda_offloading_entries;
+///entry != &__stop_cuda_offloading_entries; ++entry) {
+/// if (!entry->size)
+///   __cudaRegisterFunction(fatbinHandle, entry->addr, entry->name,
+///  entry->name, -1, 0, 0, 0, 0, 0);
+/// else
+///   __cudaRegisterVar(fatbinHandle, entry->addr, entry->name, entry->name,

[PATCH] D123812: [CUDA] Add wrapper code generation for registering CUDA images

2022-04-22 Thread Joseph Huber via Phabricator via cfe-commits
jhuber6 updated this revision to Diff 424536.
jhuber6 added a comment.

Applied changes to wrong commit, whoops.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D123812/new/

https://reviews.llvm.org/D123812

Files:
  clang/test/Driver/linker-wrapper-image.c
  clang/test/Driver/linker-wrapper.c
  clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
  clang/tools/clang-linker-wrapper/OffloadWrapper.cpp

Index: clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
===
--- clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
+++ clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
@@ -20,6 +20,8 @@
 using namespace llvm;
 
 namespace {
+/// Magic number that begins the section containing the CUDA fatbinary.
+constexpr unsigned CudaFatMagic = 0x466243b1;
 
 IntegerType *getSizeTTy(Module &M) {
   LLVMContext &C = M.getContext();
@@ -255,6 +257,265 @@
   appendToGlobalDtors(M, Func, /*Priority*/ 1);
 }
 
+// struct fatbin_wrapper {
+//  int32_t magic;
+//  int32_t version;
+//  void *image;
+//  void *reserved;
+//};
+StructType *getFatbinWrapperTy(Module &M) {
+  LLVMContext &C = M.getContext();
+  StructType *FatbinTy = StructType::getTypeByName(C, "fatbin_wrapper");
+  if (!FatbinTy)
+FatbinTy = StructType::create("fatbin_wrapper", Type::getInt32Ty(C),
+  Type::getInt32Ty(C), Type::getInt8PtrTy(C),
+  Type::getInt8PtrTy(C));
+  return FatbinTy;
+}
+
+/// Embed the image \p Image into the module \p M so it can be found by the
+/// runtime.
+GlobalVariable *createFatbinDesc(Module &M, ArrayRef<char> Image) {
+  LLVMContext &C = M.getContext();
+  llvm::Type *Int8PtrTy = Type::getInt8PtrTy(C);
+  llvm::Triple Triple = llvm::Triple(M.getTargetTriple());
+
+  // Create the global string containing the fatbinary.
+  StringRef FatbinConstantSection =
+  Triple.isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
+  auto *Data = ConstantDataArray::get(C, Image);
+  auto *Fatbin = new GlobalVariable(M, Data->getType(), /*isConstant*/ true,
+GlobalVariable::InternalLinkage, Data,
+".fatbin_image");
+  Fatbin->setSection(FatbinConstantSection);
+
+  // Create the fatbinary wrapper
+  StringRef FatbinWrapperSection =
+  Triple.isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
+  Constant *FatbinWrapper[] = {
+  ConstantInt::get(Type::getInt32Ty(C), CudaFatMagic),
+  ConstantInt::get(Type::getInt32Ty(C), 1),
+  ConstantExpr::getPointerBitCastOrAddrSpaceCast(Fatbin, Int8PtrTy),
+  ConstantPointerNull::get(Type::getInt8PtrTy(C))};
+
+  Constant *FatbinInitializer =
+  ConstantStruct::get(getFatbinWrapperTy(M), FatbinWrapper);
+
+  auto *FatbinDesc =
+  new GlobalVariable(M, getFatbinWrapperTy(M),
+ /*isConstant*/ true, GlobalValue::InternalLinkage,
+ FatbinInitializer, ".fatbin_wrapper");
+  FatbinDesc->setSection(FatbinWrapperSection);
+  FatbinDesc->setAlignment(Align(8));
+
+  // We create a dummy entry to ensure the linker will define the begin / end
+  // symbols. The CUDA runtime should ignore the null address if we attempt to
+  // register it.
+  auto *DummyInit =
+  ConstantAggregateZero::get(ArrayType::get(getEntryTy(M), 0u));
+  auto *DummyEntry = new GlobalVariable(
+  M, DummyInit->getType(), true, GlobalVariable::ExternalLinkage, DummyInit,
+  "__dummy.cuda_offloading.entry");
+  DummyEntry->setSection("cuda_offloading_entries");
+  DummyEntry->setVisibility(GlobalValue::HiddenVisibility);
+
+  return FatbinDesc;
+}
+
+/// Create the register globals function. We will iterate all of the offloading
+/// entries stored at the begin / end symbols and register them according to
+/// their type. This creates the following function in IR:
+///
+/// extern struct __tgt_offload_entry __start_cuda_offloading_entries;
+/// extern struct __tgt_offload_entry __stop_cuda_offloading_entries;
+///
+/// extern void __cudaRegisterFunction(void **, void *, void *, void *, int,
+///void *, void *, void *, void *, int *);
+/// extern void __cudaRegisterVar(void **, void *, void *, void *, int32_t,
+///   int64_t, int32_t, int32_t);
+///
+/// void __cudaRegisterTest(void **fatbinHandle) {
+///   for (struct __tgt_offload_entry *entry = &__start_cuda_offloading_entries;
+///entry != &__stop_cuda_offloading_entries; ++entry) {
+/// if (!entry->size)
+///   __cudaRegisterFunction(fatbinHandle, entry->addr, entry->name,
+///  entry->name, -1, 0, 0, 0, 0, 0);
+/// else
+///   __cudaRegisterVar(fatbinHandle, entry->addr, entry->name, entry->name,
+/// 0, entry->size, 0, 0);
+///   }
+/// }
+///
+/// TODO: This only registers functions or variables. Additional support is

[PATCH] D123812: [CUDA] Add wrapper code generation for registering CUDA images

2022-04-22 Thread Joseph Huber via Phabricator via cfe-commits
jhuber6 updated this revision to Diff 424534.
jhuber6 added a comment.
Herald added a subscriber: MaskRay.

Adding warning for using both `-fno-gpu-rdc` and `-foffload-new-driver`. I 
think this is a good warning to have for now while this is being worked in as 
opt-in. Once this has matured I plan on adding the necessary logic to handle 
RDC and non-RDC builds correctly with this. But for the purposes of this patch 
just warning is fine.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D123812/new/

https://reviews.llvm.org/D123812

Files:
  clang/include/clang/Basic/DiagnosticDriverKinds.td
  clang/lib/Driver/ToolChains/Clang.cpp
  clang/test/Driver/linker-wrapper-image.c
  clang/test/Driver/linker-wrapper.c
  clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
  clang/tools/clang-linker-wrapper/OffloadWrapper.cpp

Index: clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
===
--- clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
+++ clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
@@ -20,6 +20,8 @@
 using namespace llvm;
 
 namespace {
+/// Magic number that begins the section containing the CUDA fatbinary.
+constexpr unsigned CudaFatMagic = 0x466243b1;
 
 IntegerType *getSizeTTy(Module &M) {
   LLVMContext &C = M.getContext();
@@ -255,6 +257,265 @@
   appendToGlobalDtors(M, Func, /*Priority*/ 1);
 }
 
+// struct fatbin_wrapper {
+//  int32_t magic;
+//  int32_t version;
+//  void *image;
+//  void *reserved;
+//};
+StructType *getFatbinWrapperTy(Module &M) {
+  LLVMContext &C = M.getContext();
+  StructType *FatbinTy = StructType::getTypeByName(C, "fatbin_wrapper");
+  if (!FatbinTy)
+FatbinTy = StructType::create("fatbin_wrapper", Type::getInt32Ty(C),
+  Type::getInt32Ty(C), Type::getInt8PtrTy(C),
+  Type::getInt8PtrTy(C));
+  return FatbinTy;
+}
+
+/// Embed the image \p Image into the module \p M so it can be found by the
+/// runtime.
+GlobalVariable *createFatbinDesc(Module &M, ArrayRef<char> Image) {
+  LLVMContext &C = M.getContext();
+  llvm::Type *Int8PtrTy = Type::getInt8PtrTy(C);
+  llvm::Triple Triple = llvm::Triple(M.getTargetTriple());
+
+  // Create the global string containing the fatbinary.
+  StringRef FatbinConstantSection =
+  Triple.isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
+  auto *Data = ConstantDataArray::get(C, Image);
+  auto *Fatbin = new GlobalVariable(M, Data->getType(), /*isConstant*/ true,
+GlobalVariable::InternalLinkage, Data,
+".fatbin_image");
+  Fatbin->setSection(FatbinConstantSection);
+
+  // Create the fatbinary wrapper
+  StringRef FatbinWrapperSection =
+  Triple.isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
+  Constant *FatbinWrapper[] = {
+  ConstantInt::get(Type::getInt32Ty(C), CudaFatMagic),
+  ConstantInt::get(Type::getInt32Ty(C), 1),
+  ConstantExpr::getPointerBitCastOrAddrSpaceCast(Fatbin, Int8PtrTy),
+  ConstantPointerNull::get(Type::getInt8PtrTy(C))};
+
+  Constant *FatbinInitializer =
+  ConstantStruct::get(getFatbinWrapperTy(M), FatbinWrapper);
+
+  auto *FatbinDesc =
+  new GlobalVariable(M, getFatbinWrapperTy(M),
+ /*isConstant*/ true, GlobalValue::InternalLinkage,
+ FatbinInitializer, ".fatbin_wrapper");
+  FatbinDesc->setSection(FatbinWrapperSection);
+  FatbinDesc->setAlignment(Align(8));
+
+  // We create a dummy entry to ensure the linker will define the begin / end
+  // symbols. The CUDA runtime should ignore the null address if we attempt to
+  // register it.
+  auto *DummyInit =
+  ConstantAggregateZero::get(ArrayType::get(getEntryTy(M), 0u));
+  auto *DummyEntry = new GlobalVariable(
+  M, DummyInit->getType(), true, GlobalVariable::ExternalLinkage, DummyInit,
+  "__dummy.cuda_offloading.entry");
+  DummyEntry->setSection("cuda_offloading_entries");
+  DummyEntry->setVisibility(GlobalValue::HiddenVisibility);
+
+  return FatbinDesc;
+}
+
+/// Create the register globals function. We will iterate all of the offloading
+/// entries stored at the begin / end symbols and register them according to
+/// their type. This creates the following function in IR:
+///
+/// extern struct __tgt_offload_entry __start_cuda_offloading_entries;
+/// extern struct __tgt_offload_entry __stop_cuda_offloading_entries;
+///
+/// extern void __cudaRegisterFunction(void **, void *, void *, void *, int,
+///void *, void *, void *, void *, int *);
+/// extern void __cudaRegisterVar(void **, void *, void *, void *, int32_t,
+///   int64_t, int32_t, int32_t);
+///
+/// void __cudaRegisterTest(void **fatbinHandle) {
+///   for (struct __tgt_offload_entry *entry = &__start_cuda_offloading_entries;
+///entry != &__stop_cuda_offloading_entries; 

[PATCH] D123812: [CUDA] Add wrapper code generation for registering CUDA images

2022-04-20 Thread Joseph Huber via Phabricator via cfe-commits
jhuber6 updated this revision to Diff 423993.
jhuber6 added a comment.
Herald added a subscriber: sstefan1.

Adjusting tests.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D123812/new/

https://reviews.llvm.org/D123812

Files:
  clang/test/Driver/linker-wrapper-image.c
  clang/test/Driver/linker-wrapper.c
  clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
  clang/tools/clang-linker-wrapper/OffloadWrapper.cpp

Index: clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
===
--- clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
+++ clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
@@ -20,6 +20,8 @@
 using namespace llvm;
 
 namespace {
+/// Magic number that begins the section containing the CUDA fatbinary.
+constexpr unsigned CudaFatMagic = 0x466243b1;
 
 IntegerType *getSizeTTy(Module &M) {
   LLVMContext &C = M.getContext();
@@ -255,6 +257,265 @@
   appendToGlobalDtors(M, Func, /*Priority*/ 1);
 }
 
+// struct fatbin_wrapper {
+//  int32_t magic;
+//  int32_t version;
+//  void *image;
+//  void *reserved;
+//};
+StructType *getFatbinWrapperTy(Module &M) {
+  LLVMContext &C = M.getContext();
+  StructType *FatbinTy = StructType::getTypeByName(C, "fatbin_wrapper");
+  if (!FatbinTy)
+FatbinTy = StructType::create("fatbin_wrapper", Type::getInt32Ty(C),
+  Type::getInt32Ty(C), Type::getInt8PtrTy(C),
+  Type::getInt8PtrTy(C));
+  return FatbinTy;
+}
+
+/// Embed the image \p Image into the module \p M so it can be found by the
+/// runtime.
+GlobalVariable *createFatbinDesc(Module &M, ArrayRef<char> Image) {
+  LLVMContext &C = M.getContext();
+  llvm::Type *Int8PtrTy = Type::getInt8PtrTy(C);
+  llvm::Triple Triple = llvm::Triple(M.getTargetTriple());
+
+  // Create the global string containing the fatbinary.
+  StringRef FatbinConstantSection =
+  Triple.isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
+  auto *Data = ConstantDataArray::get(C, Image);
+  auto *Fatbin = new GlobalVariable(M, Data->getType(), /*isConstant*/ true,
+GlobalVariable::InternalLinkage, Data,
+".fatbin_image");
+  Fatbin->setSection(FatbinConstantSection);
+
+  // Create the fatbinary wrapper
+  StringRef FatbinWrapperSection =
+  Triple.isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
+  Constant *FatbinWrapper[] = {
+  ConstantInt::get(Type::getInt32Ty(C), CudaFatMagic),
+  ConstantInt::get(Type::getInt32Ty(C), 1),
+  ConstantExpr::getPointerBitCastOrAddrSpaceCast(Fatbin, Int8PtrTy),
+  ConstantPointerNull::get(Type::getInt8PtrTy(C))};
+
+  Constant *FatbinInitializer =
+  ConstantStruct::get(getFatbinWrapperTy(M), FatbinWrapper);
+
+  auto *FatbinDesc =
+  new GlobalVariable(M, getFatbinWrapperTy(M),
+ /*isConstant*/ true, GlobalValue::InternalLinkage,
+ FatbinInitializer, ".fatbin_wrapper");
+  FatbinDesc->setSection(FatbinWrapperSection);
+  FatbinDesc->setAlignment(Align(8));
+
+  // We create a dummy entry to ensure the linker will define the begin / end
+  // symbols. The CUDA runtime should ignore the null address if we attempt to
+  // register it.
+  auto *DummyInit =
+  ConstantAggregateZero::get(ArrayType::get(getEntryTy(M), 0u));
+  auto *DummyEntry = new GlobalVariable(
+  M, DummyInit->getType(), true, GlobalVariable::ExternalLinkage, DummyInit,
+  "__dummy.cuda_offloading.entry");
+  DummyEntry->setSection("cuda_offloading_entries");
+  DummyEntry->setVisibility(GlobalValue::HiddenVisibility);
+
+  return FatbinDesc;
+}
+
+/// Create the register globals function. We will iterate all of the offloading
+/// entries stored at the begin / end symbols and register them according to
+/// their type. This creates the following function in IR:
+///
+/// extern struct __tgt_offload_entry __start_cuda_offloading_entries;
+/// extern struct __tgt_offload_entry __stop_cuda_offloading_entries;
+///
+/// extern void __cudaRegisterFunction(void **, void *, void *, void *, int,
+///void *, void *, void *, void *, int *);
+/// extern void __cudaRegisterVar(void **, void *, void *, void *, int32_t,
+///   int64_t, int32_t, int32_t);
+///
+/// void __cudaRegisterTest(void **fatbinHandle) {
+///   for (struct __tgt_offload_entry *entry = &__start_cuda_offloading_entries;
+///entry != &__stop_cuda_offloading_entries; ++entry) {
+/// if (!entry->size)
+///   __cudaRegisterFunction(fatbinHandle, entry->addr, entry->name,
+///  entry->name, -1, 0, 0, 0, 0, 0);
+/// else
+///   __cudaRegisterVar(fatbinHandle, entry->addr, entry->name, entry->name,
+/// 0, entry->size, 0, 0);
+///   }
+/// }
+///
+/// TODO: This only registers functions and variables. Additional 

[PATCH] D123812: [CUDA] Add wrapper code generation for registering CUDA images

2022-04-20 Thread Joseph Huber via Phabricator via cfe-commits
jhuber6 added inline comments.



Comment at: clang/test/Driver/linker-wrapper-image.c:32
+// RUN: %clang -cc1 %s -emit-obj -o %t.o \
+// RUN:   
-fembed-offload-object=%S/Inputs/dummy-elf.o,cuda,nvptx64-nvida-cuda,sm_70
+// RUN: clang-linker-wrapper --print-wrapped-module --dry-run -linker-path 
/usr/bin/ld \

yaxunl wrote:
> What happens if there are multiple binaries for different GPUs? Will the 
> linker-wrapper generate one fatbinary containing both ELFs and embed the 
> fatbinary as one image?
Yes, I'll add it to the other test.



Comment at: clang/test/Driver/linker-wrapper.c:46
+// RUN:   
-fembed-offload-object=%S/Inputs/dummy-elf.o,cuda,nvptx64-nvida-cuda,sm_70 \
+// RUN:   
-fembed-offload-object=%S/Inputs/dummy-elf.o,cuda,nvptx64-nvida-cuda,sm_70
+// RUN: clang-linker-wrapper --dry-run -linker-path /usr/bin/ld -- %t.o -o 
a.out \

yaxunl wrote:
> This option is the same as the preceding option. Is this intentional? Can we 
> have a test that embeds multiple binaries for different GPUs?
It's intentional to show that we can pull out two objects embedded in a single 
file (Like if someone did `ld -r` or something). I'll add binaries for 
different GPUs to show that works.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D123812/new/

https://reviews.llvm.org/D123812

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D123812: [CUDA] Add wrapper code generation for registering CUDA images

2022-04-20 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl added inline comments.



Comment at: clang/test/Driver/linker-wrapper-image.c:32
+// RUN: %clang -cc1 %s -emit-obj -o %t.o \
+// RUN:   
-fembed-offload-object=%S/Inputs/dummy-elf.o,cuda,nvptx64-nvida-cuda,sm_70
+// RUN: clang-linker-wrapper --print-wrapped-module --dry-run -linker-path 
/usr/bin/ld \

What happens if there are multiple binaries for different GPUs? Will the 
linker-wrapper generate one fatbinary containing both ELFs and embed the 
fatbinary as one image?



Comment at: clang/test/Driver/linker-wrapper.c:46
+// RUN:   
-fembed-offload-object=%S/Inputs/dummy-elf.o,cuda,nvptx64-nvida-cuda,sm_70 \
+// RUN:   
-fembed-offload-object=%S/Inputs/dummy-elf.o,cuda,nvptx64-nvida-cuda,sm_70
+// RUN: clang-linker-wrapper --dry-run -linker-path /usr/bin/ld -- %t.o -o 
a.out \

This option is the same as the preceding option. Is this intentional? Can we 
have a test that embeds multiple binaries for different GPUs?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D123812/new/

https://reviews.llvm.org/D123812

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D123812: [CUDA] Add wrapper code generation for registering CUDA images

2022-04-19 Thread Joseph Huber via Phabricator via cfe-commits
jhuber6 updated this revision to Diff 423796.
jhuber6 added a comment.

Adding tests for wrapper codegen and fatbinary usage.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D123812/new/

https://reviews.llvm.org/D123812

Files:
  clang/test/Driver/linker-wrapper-image.c
  clang/test/Driver/linker-wrapper.c
  clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
  clang/tools/clang-linker-wrapper/OffloadWrapper.cpp

Index: clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
===
--- clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
+++ clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
@@ -20,6 +20,8 @@
 using namespace llvm;
 
 namespace {
+/// Magic number that begins the section containing the CUDA fatbinary.
+constexpr unsigned CudaFatMagic = 0x466243b1;
 
 IntegerType *getSizeTTy(Module ) {
   LLVMContext  = M.getContext();
@@ -255,6 +257,265 @@
   appendToGlobalDtors(M, Func, /*Priority*/ 1);
 }
 
+// struct fatbin_wrapper {
+//  int32_t magic;
+//  int32_t version;
+//  void *image;
+//  void *reserved;
+//};
+StructType *getFatbinWrapperTy(Module &M) {
+  LLVMContext &C = M.getContext();
+  StructType *FatbinTy = StructType::getTypeByName(C, "fatbin_wrapper");
+  if (!FatbinTy)
+FatbinTy = StructType::create("fatbin_wrapper", Type::getInt32Ty(C),
+  Type::getInt32Ty(C), Type::getInt8PtrTy(C),
+  Type::getInt8PtrTy(C));
+  return FatbinTy;
+}
+
+/// Embed the image \p Image into the module \p M so it can be found by the
+/// runtime.
+GlobalVariable *createFatbinDesc(Module &M, ArrayRef<char> Image) {
+  LLVMContext &C = M.getContext();
+  llvm::Type *Int8PtrTy = Type::getInt8PtrTy(C);
+  llvm::Triple Triple = llvm::Triple(M.getTargetTriple());
+
+  // Create the global string containing the fatbinary.
+  StringRef FatbinConstantSection =
+  Triple.isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
+  auto *Data = ConstantDataArray::get(C, Image);
+  auto *Fatbin = new GlobalVariable(M, Data->getType(), /*isConstant*/ true,
+GlobalVariable::InternalLinkage, Data,
+".fatbin_image");
+  Fatbin->setSection(FatbinConstantSection);
+
+  // Create the fatbinary wrapper
+  StringRef FatbinWrapperSection =
+  Triple.isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
+  Constant *FatbinWrapper[] = {
+  ConstantInt::get(Type::getInt32Ty(C), CudaFatMagic),
+  ConstantInt::get(Type::getInt32Ty(C), 1),
+  ConstantExpr::getPointerBitCastOrAddrSpaceCast(Fatbin, Int8PtrTy),
+  ConstantPointerNull::get(Type::getInt8PtrTy(C))};
+
+  Constant *FatbinInitializer =
+  ConstantStruct::get(getFatbinWrapperTy(M), FatbinWrapper);
+
+  auto *FatbinDesc =
+  new GlobalVariable(M, getFatbinWrapperTy(M),
+ /*isConstant*/ true, GlobalValue::InternalLinkage,
+ FatbinInitializer, ".fatbin_wrapper");
+  FatbinDesc->setSection(FatbinWrapperSection);
+  FatbinDesc->setAlignment(Align(8));
+
+  // We create a dummy entry to ensure the linker will define the begin / end
+  // symbols. The CUDA runtime should ignore the null address if we attempt to
+  // register it.
+  auto *DummyInit =
+  ConstantAggregateZero::get(ArrayType::get(getEntryTy(M), 0u));
+  auto *DummyEntry = new GlobalVariable(
+  M, DummyInit->getType(), true, GlobalVariable::ExternalLinkage, DummyInit,
+  "__dummy.cuda_offloading.entry");
+  DummyEntry->setSection("cuda_offloading_entries");
+  DummyEntry->setVisibility(GlobalValue::HiddenVisibility);
+
+  return FatbinDesc;
+}
+
+/// Create the register globals function. We will iterate all of the offloading
+/// entries stored at the begin / end symbols and register them according to
+/// their type. This creates the following function in IR:
+///
+/// extern struct __tgt_offload_entry __start_cuda_offloading_entries;
+/// extern struct __tgt_offload_entry __stop_cuda_offloading_entries;
+///
+/// extern void __cudaRegisterFunction(void **, void *, void *, void *, int,
+///void *, void *, void *, void *, int *);
+/// extern void __cudaRegisterVar(void **, void *, void *, void *, int32_t,
+///   int64_t, int32_t, int32_t);
+///
+/// void __cudaRegisterTest(void **fatbinHandle) {
+///   for (struct __tgt_offload_entry *entry = &__start_cuda_offloading_entries;
+///entry != &__stop_cuda_offloading_entries; ++entry) {
+/// if (!entry->size)
+///   __cudaRegisterFunction(fatbinHandle, entry->addr, entry->name,
+///  entry->name, -1, 0, 0, 0, 0, 0);
+/// else
+///   __cudaRegisterVar(fatbinHandle, entry->addr, entry->name, entry->name,
+/// 0, entry->size, 0, 0);
+///   }
+/// }
+///
+/// TODO: This only registers functions and variables. Additional 

[PATCH] D123812: [CUDA] Add wrapper code generation for registering CUDA images

2022-04-19 Thread Joseph Huber via Phabricator via cfe-commits
jhuber6 added a comment.

In D123812#3459164 , @yaxunl wrote:

> need a CodeGenCUDA test for the registering. Also need a Driver test for the 
> subcommands.

Testing things inside the linker wrapper is a little hairy. I may need to add a 
special option for doing dry runs and printing the wrapping code so we can test 
this more satisfactorily. Doing that will create some more noise for review, 
unfortunately. For OpenMP I simply ran all of our unit tests using the new 
driver and considered that sufficient evidence that it was working. Also, which 
driver sub-commands should be tested?

In D123812#3459172 , @yaxunl wrote:

> Also, I am wondering whether we should document the new embedding scheme: 
> section names, symbol names, entries, etc, if it has not been done.

There's some existing documentation I wrote for OpenMP offloading once we 
started using this scheme. I was planning on updating it with this scheme for 
CUDA once it's landed; there's no sense writing documentation before it's 
final.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D123812/new/

https://reviews.llvm.org/D123812

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D123812: [CUDA] Add wrapper code generation for registering CUDA images

2022-04-19 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl added a comment.

Also, I am wondering whether we should document the new embedding scheme: 
section names, symbol names, entries, etc, if it has not been done.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D123812/new/

https://reviews.llvm.org/D123812

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D123812: [CUDA] Add wrapper code generation for registering CUDA images

2022-04-19 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl added a comment.

need a CodeGenCUDA test for the registering. Also need a Driver test for the 
subcommands.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D123812/new/

https://reviews.llvm.org/D123812

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D123812: [CUDA] Add wrapper code generation for registering CUDA images

2022-04-14 Thread Joseph Huber via Phabricator via cfe-commits
jhuber6 added inline comments.



Comment at: clang/tools/clang-linker-wrapper/OffloadWrapper.cpp:351
+///   required for texture / surface / managed variables.
+Function *createRegisterGlobalsFunction(Module &M) {
+  LLVMContext &C = M.getContext();

tra wrote:
> jhuber6 wrote:
> > tra wrote:
> > > Do you think generation of the CUDA registration glue could be shared 
> > > with the front-end?
> > > 
> > I was thinking about it, but ultimately decided to keep the noise outside 
> > of the new driver to a minimum. Maybe if we move to the offloading entries 
> > being a common format we can easily share this code. Keeping it in Clang 
> > would have the advantage that it's easier to test directly and ensures we 
> > don't de-sync if anything changes. The only downside is that in the future 
> > I may want to push this functionality to a linker plugin or similar, which 
> > would require pulling it out of Clang again to prevent us needing to link 
> > in Clang to build LLVM.
> > 
> > Also needing to do this all through the builder API isn't ideal, it would 
> > be nice if we had some kind of runtime to call to do this for us, but I 
> > didn't feel like adding yet another shared library for CUDA. I considered 
> > putting it inside the cuda header wrappers as well, but forcing every CUDA 
> > file to have some externally visible weak registration blob didn't sit well 
> > with me.
> Perhaps front-end is not the right place for it, indeed. LLVM itself may be a 
> better choice. We already have some things there for somewhat similar 
> purposes (like lib/WindowsManifest) so adding a helper function to generate 
> runtime glue for CUDA should not be unreasonable.
I think it's fine here for this patch, but I definitely want to move it into 
LLVM in the future once I start generalizing more of this stuff.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D123812/new/

https://reviews.llvm.org/D123812

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D123812: [CUDA] Add wrapper code generation for registering CUDA images

2022-04-14 Thread Artem Belevich via Phabricator via cfe-commits
tra added inline comments.



Comment at: clang/tools/clang-linker-wrapper/OffloadWrapper.cpp:351
+///   required for texture / surface / managed variables.
+Function *createRegisterGlobalsFunction(Module &M) {
+  LLVMContext &C = M.getContext();

jhuber6 wrote:
> tra wrote:
> > Do you think generation of the CUDA registration glue could be shared with 
> > the front-end?
> > 
> I was thinking about it, but ultimately decided to keep the noise outside of 
> the new driver to a minimum. Maybe if we move to the offloading entries being 
> a common format we can easily share this code. Keeping it in Clang would have 
> the advantage that it's easier to test directly and ensures we don't de-sync 
> if anything changes. The only downside is that in the future I may want to 
> push this functionality to a linker plugin or similar, which would require 
> pulling it out of Clang again to prevent us needing to link in Clang to build 
> LLVM.
> 
> Also needing to do this all through the builder API isn't ideal, it would be 
> nice if we had some kind of runtime to call to do this for us, but I didn't 
> feel like adding yet another shared library for CUDA. I considered putting it 
> inside the cuda header wrappers as well, but forcing every CUDA file to have 
> some externally visible weak registration blob didn't sit well with me.
Perhaps front-end is not the right place for it, indeed. LLVM itself may be a 
better choice. We already have some things there for somewhat similar purposes 
(like lib/WindowsManifest) so adding a helper function to generate runtime glue 
for CUDA should not be unreasonable.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D123812/new/

https://reviews.llvm.org/D123812

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D123812: [CUDA] Add wrapper code generation for registering CUDA images

2022-04-14 Thread Joseph Huber via Phabricator via cfe-commits
jhuber6 added inline comments.



Comment at: clang/tools/clang-linker-wrapper/OffloadWrapper.cpp:351
+///   required for texture / surface / managed variables.
+Function *createRegisterGlobalsFunction(Module &M) {
+  LLVMContext &C = M.getContext();

tra wrote:
> Do you think generation of the CUDA registration glue could be shared with 
> the front-end?
> 
I was thinking about it, but ultimately decided to keep the noise outside of 
the new driver to a minimum. Maybe if we move to the offloading entries being a 
common format we can easily share this code. Keeping it in Clang would have the 
advantage that it's easier to test directly and ensures we don't de-sync if 
anything changes. The only downside is that in the future I may want to push 
this functionality to a linker plugin or similar, which would require pulling 
it out of Clang again to prevent us needing to link in Clang to build LLVM.

Also needing to do this all through the builder API isn't ideal, it would be 
nice if we had some kind of runtime to call to do this for us, but I didn't 
feel like adding yet another shared library for CUDA. I considered putting it 
inside the cuda header wrappers as well, but forcing every CUDA file to have 
some externally visible weak registration blob didn't sit well with me.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D123812/new/

https://reviews.llvm.org/D123812

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D123812: [CUDA] Add wrapper code generation for registering CUDA images

2022-04-14 Thread Artem Belevich via Phabricator via cfe-commits
tra added inline comments.



Comment at: clang/tools/clang-linker-wrapper/OffloadWrapper.cpp:351
+///   required for texture / surface / managed variables.
+Function *createRegisterGlobalsFunction(Module &M) {
+  LLVMContext &C = M.getContext();

Do you think generation of the CUDA registration glue could be shared with the 
front-end?



Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D123812/new/

https://reviews.llvm.org/D123812

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D123812: [CUDA] Add wrapper code generation for registering CUDA images

2022-04-14 Thread Joseph Huber via Phabricator via cfe-commits
jhuber6 added a comment.

For reference, this is how I used this driver to offload a CUDA and OpenMP 
offload kernel that both called an external "hello world" function in 
`device.cu`.

  $ clang++ hello.cu device.cu -foffload-new-driver --offload-arch=sm_70 -c
  $ clang++ openmp.cpp -fopenmp-new-driver -fopenmp -fopenmp-targets=nvptx64 
-Xopenmp-target=nvptx64 -march=sm_70 -fopenmp-offload-mandatory -c
  $ clang++ hello.o device.o openmp.o -foffload-new-driver -fopenmp 
-fopenmp-targets=nvptx64


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D123812/new/

https://reviews.llvm.org/D123812

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D123812: [CUDA] Add wrapper code generation for registering CUDA images

2022-04-14 Thread Joseph Huber via Phabricator via cfe-commits
jhuber6 created this revision.
jhuber6 added reviewers: jdoerfert, JonChesterfield, tra, yaxunl.
Herald added a subscriber: carlosgalvezp.
Herald added a project: All.
jhuber6 requested review of this revision.
Herald added a project: clang.
Herald added a subscriber: cfe-commits.

This patch adds the necessary code generation to create the wrapper code
that registers all the globals in CUDA. We create the necessary
functions and iterate through the list of
`__start_cuda_offloading_entries` to find which globals must be
registered. This is very similar to the code generation done currently
in Clang for non-rdc builds, but here we are registering a fully linked
fatbinary and finding the globals via the above sections.

With this we should be able to fully support basic RDC / LTO building of CUDA
code.

It's also worth noting that this does not include the necessary PTX to JIT the
image, so to use this support the offloading architecture must match the
system's architecture.

Depends on D123810 


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D123812

Files:
  clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
  clang/tools/clang-linker-wrapper/OffloadWrapper.cpp

Index: clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
===
--- clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
+++ clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
@@ -20,6 +20,8 @@
 using namespace llvm;
 
 namespace {
+/// Magic number that begins the section containing the CUDA fatbinary.
+constexpr unsigned CudaFatMagic = 0x466243b1;
 
 IntegerType *getSizeTTy(Module ) {
   LLVMContext  = M.getContext();
@@ -255,6 +257,265 @@
   appendToGlobalDtors(M, Func, /*Priority*/ 1);
 }
 
+// struct fatbin_wrapper {
+//  int32_t magic;
+//  int32_t version;
+//  void *image;
+//  void *reserved;
+//};
+StructType *getFatbinWrapperTy(Module &M) {
+  LLVMContext &C = M.getContext();
+  StructType *FatbinTy = StructType::getTypeByName(C, "fatbin_wrapper");
+  if (!FatbinTy)
+FatbinTy = StructType::create("fatbin_wrapper", Type::getInt32Ty(C),
+  Type::getInt32Ty(C), Type::getInt8PtrTy(C),
+  Type::getInt8PtrTy(C));
+  return FatbinTy;
+}
+
+/// Embed the image \p Image into the module \p M so it can be found by the
+/// runtime.
+GlobalVariable *createFatbinDesc(Module &M, ArrayRef<char> Image) {
+  LLVMContext &C = M.getContext();
+  llvm::Type *Int8PtrTy = Type::getInt8PtrTy(C);
+  llvm::Triple Triple = llvm::Triple(M.getTargetTriple());
+
+  // Create the global string containing the fatbinary.
+  StringRef FatbinConstantSection =
+  Triple.isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
+  auto *Data = ConstantDataArray::get(C, Image);
+  auto *Fatbin = new GlobalVariable(M, Data->getType(), /*isConstant*/ true,
+GlobalVariable::InternalLinkage, Data,
+".fatbin_image");
+  Fatbin->setSection(FatbinConstantSection);
+
+  // Create the fatbinary wrapper
+  StringRef FatbinWrapperSection =
+  Triple.isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
+  Constant *FatbinWrapper[] = {
+  ConstantInt::get(Type::getInt32Ty(C), CudaFatMagic),
+  ConstantInt::get(Type::getInt32Ty(C), 1),
+  ConstantExpr::getPointerBitCastOrAddrSpaceCast(Fatbin, Int8PtrTy),
+  ConstantPointerNull::get(Type::getInt8PtrTy(C))};
+
+  Constant *FatbinInitializer =
+  ConstantStruct::get(getFatbinWrapperTy(M), FatbinWrapper);
+
+  auto *FatbinDesc =
+  new GlobalVariable(M, getFatbinWrapperTy(M),
+ /*isConstant*/ true, GlobalValue::InternalLinkage,
+ FatbinInitializer, ".fatbin_wrapper");
+  FatbinDesc->setSection(FatbinWrapperSection);
+  FatbinDesc->setAlignment(Align(8));
+
+  // We create a dummy entry to ensure the linker will define the begin / end
+  // symbols. The CUDA runtime should ignore the null address if we attempt to
+  // register it.
+  auto *DummyInit =
+  ConstantAggregateZero::get(ArrayType::get(getEntryTy(M), 0u));
+  auto *DummyEntry = new GlobalVariable(
+  M, DummyInit->getType(), true, GlobalVariable::ExternalLinkage, DummyInit,
+  "__dummy.cuda_offloading.entry");
+  DummyEntry->setSection("cuda_offloading_entries");
+  DummyEntry->setVisibility(GlobalValue::HiddenVisibility);
+
+  return FatbinDesc;
+}
+
+/// Create the register globals function. We will iterate all of the offloading
+/// entries stored at the begin / end symbols and register them according to
+/// their type. This creates the following function in IR:
+///
+/// extern struct __tgt_offload_entry __start_cuda_offloading_entries;
+/// extern struct __tgt_offload_entry __stop_cuda_offloading_entries;
+///
+/// extern void __cudaRegisterFunction(void **, void *, void *, void *, int,
+///void *, void *,