[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-08-29 Thread Saiyedul Islam via Phabricator via cfe-commits
saiislam added inline comments.



Comment at: clang/test/CodeGenCUDA/amdgpu-code-object-version-linking.cu:12
+// RUN: llvm-link %t_0 %t_5 -o -| llvm-dis -o - | FileCheck 
-check-prefix=LINKED5 %s
+
+#include "Inputs/cuda.h"

yaxunl wrote:
> saiislam wrote:
> > yaxunl wrote:
> > > need to test using clang -cc1 with -O3 and -mlink-builtin-bitcode to link 
> > > the device lib and verify the load of llvm.amdgcn.abi.version being 
> > > eliminated after optimization.
> > > 
> > > I think currently it cannot do that since llvm.amdgcn.abi.version is not 
> > > internalized by the internalization pass. This can cause some significant 
> > > perf drops since loading is expensive. Need to tweak the function 
> > > controlling what variables can be internalized for amdgpu so that this 
> > > variable gets internalized, or having a generic way to tell that function 
> > > which variables should be internalized, e.g. by adding a metadata 
> > > amdgcn.internalize
> > load of llvm.amdgcn.abi.version is being eliminated with cc1, -O3, and 
> > mlink-builtin-bitcode of device lib.
> It seems to be eliminated by IPSCCP. It makes sense since it is constant 
> weak_odr without externally_initialized. Either changing it to weak or adding 
> externally_initialized will keep the load. Normal `__constant__` var in 
> device code may be changed by host code, therefore they are emitted with 
> externally_initialized and do not have the load eliminated.
Thank you @yaxunl !
I have added these observations as comments in the code at load emit and global 
emit locations.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-08-29 Thread Saiyedul Islam via Phabricator via cfe-commits
This revision was landed with ongoing or failed builds.
This revision was automatically updated to reflect the committed changes.
Closed by commit rGf616c3eeb43f: [OpenMP][DeviceRTL][AMDGPU] Support code 
object version 5 (authored by saiislam).

Changed prior to commit:
  https://reviews.llvm.org/D139730?vs=553991&id=554262#toc

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

Files:
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/lib/CodeGen/CodeGenModule.h
  clang/lib/CodeGen/TargetInfo.h
  clang/lib/CodeGen/Targets/AMDGPU.cpp
  clang/lib/Driver/ToolChain.cpp
  clang/lib/Driver/ToolChains/Clang.cpp
  clang/test/CodeGenCUDA/amdgpu-code-object-version-linking.cu
  clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
  clang/test/CodeGenOpenCL/opencl_types.cl
  clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
  openmp/libomptarget/DeviceRTL/CMakeLists.txt
  openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
  openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h

Index: openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
===
--- openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
+++ openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
@@ -25,6 +25,7 @@
 #include "llvm/Support/MemoryBufferRef.h"
 
 #include "llvm/Support/YAMLTraits.h"
+using namespace llvm::ELF;
 
 namespace llvm {
 namespace omp {
@@ -32,19 +33,29 @@
 namespace plugin {
 namespace utils {
 
-// The implicit arguments of AMDGPU kernels.
+// The implicit arguments of COV5 AMDGPU kernels.
 struct AMDGPUImplicitArgsTy {
-  uint64_t OffsetX;
-  uint64_t OffsetY;
-  uint64_t OffsetZ;
-  uint64_t HostcallPtr;
-  uint64_t Unused0;
-  uint64_t Unused1;
-  uint64_t Unused2;
+  uint32_t BlockCountX;
+  uint32_t BlockCountY;
+  uint32_t BlockCountZ;
+  uint16_t GroupSizeX;
+  uint16_t GroupSizeY;
+  uint16_t GroupSizeZ;
+  uint8_t Unused0[46]; // 46 byte offset.
+  uint16_t GridDims;
+  uint8_t Unused1[190]; // 190 byte offset.
 };
 
-static_assert(sizeof(AMDGPUImplicitArgsTy) == 56,
-  "Unexpected size of implicit arguments");
+// Dummy struct for COV4 implicitargs.
+struct AMDGPUImplicitArgsTyCOV4 {
+  uint8_t Unused[56];
+};
+
+uint32_t getImplicitArgsSize(uint16_t Version) {
+  return Version < ELF::ELFABIVERSION_AMDGPU_HSA_V5
+ ? sizeof(AMDGPUImplicitArgsTyCOV4)
+ : sizeof(AMDGPUImplicitArgsTy);
+}
 
 /// Parse a TargetID to get processor arch and feature map.
 /// Returns processor subarch.
@@ -295,7 +306,8 @@
 /// Reads the AMDGPU specific metadata from the ELF file and propagates the
 /// KernelInfoMap
 Error readAMDGPUMetaDataFromImage(MemoryBufferRef MemBuffer,
-  StringMap &KernelInfoMap) {
+  StringMap &KernelInfoMap,
+  uint16_t &ELFABIVersion) {
   Error Err = Error::success(); // Used later as out-parameter
 
   auto ELFOrError = object::ELF64LEFile::create(MemBuffer.getBuffer());
@@ -305,6 +317,12 @@
   const object::ELF64LEFile ELFObj = ELFOrError.get();
   ArrayRef Sections = cantFail(ELFObj.sections());
   KernelInfoReader Reader(KernelInfoMap);
+
+  // Read the code object version from ELF image header
+  auto Header = ELFObj.getHeader();
+  ELFABIVersion = (uint8_t)(Header.e_ident[ELF::EI_ABIVERSION]);
+  DP("ELFABIVERSION Version: %u\n", ELFABIVersion);
+
   for (const auto &S : Sections) {
 if (S.sh_type != ELF::SHT_NOTE)
   continue;
Index: openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
===
--- openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -381,6 +381,9 @@
   /// Get the executable.
   hsa_executable_t getExecutable() const { return Executable; }
 
+  /// Get the Code Object Version of the ELF
+  uint16_t getELFABIVersion() const { return ELFABIVersion; }
+
   /// Find an HSA device symbol by its name on the executable.
   Expected
   findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const;
@@ -401,6 +404,7 @@
   hsa_executable_t Executable;
   hsa_code_object_t CodeObject;
   StringMap KernelInfoMap;
+  uint16_t ELFABIVersion;
 };
 
 /// Class implementing the AMDGPU kernel functionalities which derives from the
@@ -408,8 +412,7 @@
 struct AMDGPUKernelTy : public GenericKernelTy {
   /// Create an AMDGPU kernel with a name and an execution mode.
   AMDGPUKernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode)
-  : GenericKernelTy(Name, ExecutionMode),
-ImplicitArgsSize(sizeof(utils::AMDGPUImplicitArgsTy)) {}
+  : GenericKernelTy(Name, ExecutionMode) {}
 
   /// Initialize the AMDGPU kernel.
   Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override {
@@ -450,6 +453,

[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-08-28 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl accepted this revision.
yaxunl added a comment.

LGTM. Thanks




Comment at: clang/test/CodeGenCUDA/amdgpu-code-object-version-linking.cu:12
+// RUN: llvm-link %t_0 %t_5 -o -| llvm-dis -o - | FileCheck 
-check-prefix=LINKED5 %s
+
+#include "Inputs/cuda.h"

saiislam wrote:
> yaxunl wrote:
> > need to test using clang -cc1 with -O3 and -mlink-builtin-bitcode to link 
> > the device lib and verify the load of llvm.amdgcn.abi.version being 
> > eliminated after optimization.
> > 
> > I think currently it cannot do that since llvm.amdgcn.abi.version is not 
> > internalized by the internalization pass. This can cause some significant 
> > perf drops since loading is expensive. Need to tweak the function 
> > controlling what variables can be internalized for amdgpu so that this 
> > variable gets internalized, or having a generic way to tell that function 
> > which variables should be internalized, e.g. by adding a metadata 
> > amdgcn.internalize
> load of llvm.amdgcn.abi.version is being eliminated with cc1, -O3, and 
> mlink-builtin-bitcode of device lib.
It seems to be eliminated by IPSCCP. It makes sense since it is constant 
weak_odr without externally_initialized. Either changing it to weak or adding 
externally_initialized will keep the load. Normal `__constant__` var in device 
code may be changed by host code, therefore they are emitted with 
externally_initialized and do not have the load eliminated.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-08-28 Thread Joseph Huber via Phabricator via cfe-commits
jhuber6 accepted this revision.
jhuber6 added a comment.
This revision is now accepted and ready to land.

I think it's fine now given that it's passing tests. Others feel free to 
comment.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-08-28 Thread Saiyedul Islam via Phabricator via cfe-commits
saiislam updated this revision to Diff 553991.
saiislam marked 3 inline comments as done.
saiislam added a comment.

Minor fixes addressing reviewer's comment.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

Files:
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/lib/CodeGen/CodeGenModule.h
  clang/lib/CodeGen/TargetInfo.h
  clang/lib/CodeGen/Targets/AMDGPU.cpp
  clang/lib/Driver/ToolChain.cpp
  clang/lib/Driver/ToolChains/Clang.cpp
  clang/test/CodeGenCUDA/amdgpu-code-object-version-linking.cu
  clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
  clang/test/CodeGenOpenCL/opencl_types.cl
  clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
  openmp/libomptarget/DeviceRTL/CMakeLists.txt
  openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
  openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h

Index: openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
===
--- openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
+++ openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
@@ -25,6 +25,7 @@
 #include "llvm/Support/MemoryBufferRef.h"
 
 #include "llvm/Support/YAMLTraits.h"
+using namespace llvm::ELF;
 
 namespace llvm {
 namespace omp {
@@ -32,19 +33,29 @@
 namespace plugin {
 namespace utils {
 
-// The implicit arguments of AMDGPU kernels.
+// The implicit arguments of COV5 AMDGPU kernels.
 struct AMDGPUImplicitArgsTy {
-  uint64_t OffsetX;
-  uint64_t OffsetY;
-  uint64_t OffsetZ;
-  uint64_t HostcallPtr;
-  uint64_t Unused0;
-  uint64_t Unused1;
-  uint64_t Unused2;
+  uint32_t BlockCountX;
+  uint32_t BlockCountY;
+  uint32_t BlockCountZ;
+  uint16_t GroupSizeX;
+  uint16_t GroupSizeY;
+  uint16_t GroupSizeZ;
+  uint8_t Unused0[46]; // 46 byte offset.
+  uint16_t GridDims;
+  uint8_t Unused1[190]; // 190 byte offset.
 };
 
-static_assert(sizeof(AMDGPUImplicitArgsTy) == 56,
-  "Unexpected size of implicit arguments");
+// Dummy struct for COV4 implicitargs.
+struct AMDGPUImplicitArgsTyCOV4 {
+  uint8_t Unused[56];
+};
+
+uint32_t getImplicitArgsSize(uint16_t Version) {
+  return Version < ELF::ELFABIVERSION_AMDGPU_HSA_V5
+ ? sizeof(AMDGPUImplicitArgsTyCOV4)
+ : sizeof(AMDGPUImplicitArgsTy);
+}
 
 /// Parse a TargetID to get processor arch and feature map.
 /// Returns processor subarch.
@@ -295,7 +306,8 @@
 /// Reads the AMDGPU specific metadata from the ELF file and propagates the
 /// KernelInfoMap
 Error readAMDGPUMetaDataFromImage(MemoryBufferRef MemBuffer,
-  StringMap &KernelInfoMap) {
+  StringMap &KernelInfoMap,
+  uint16_t &ELFABIVersion) {
   Error Err = Error::success(); // Used later as out-parameter
 
   auto ELFOrError = object::ELF64LEFile::create(MemBuffer.getBuffer());
@@ -305,6 +317,12 @@
   const object::ELF64LEFile ELFObj = ELFOrError.get();
   ArrayRef Sections = cantFail(ELFObj.sections());
   KernelInfoReader Reader(KernelInfoMap);
+
+  // Read the code object version from ELF image header
+  auto Header = ELFObj.getHeader();
+  ELFABIVersion = (uint8_t)(Header.e_ident[ELF::EI_ABIVERSION]);
+  DP("ELFABIVERSION Version: %u\n", ELFABIVersion);
+
   for (const auto &S : Sections) {
 if (S.sh_type != ELF::SHT_NOTE)
   continue;
Index: openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
===
--- openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -381,6 +381,9 @@
   /// Get the executable.
   hsa_executable_t getExecutable() const { return Executable; }
 
+  /// Get the Code Object Version of the ELF
+  uint16_t getELFABIVersion() const { return ELFABIVersion; }
+
   /// Find an HSA device symbol by its name on the executable.
   Expected
   findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const;
@@ -401,6 +404,7 @@
   hsa_executable_t Executable;
   hsa_code_object_t CodeObject;
   StringMap KernelInfoMap;
+  uint16_t ELFABIVersion;
 };
 
 /// Class implementing the AMDGPU kernel functionalities which derives from the
@@ -408,8 +412,7 @@
 struct AMDGPUKernelTy : public GenericKernelTy {
   /// Create an AMDGPU kernel with a name and an execution mode.
   AMDGPUKernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode)
-  : GenericKernelTy(Name, ExecutionMode),
-ImplicitArgsSize(sizeof(utils::AMDGPUImplicitArgsTy)) {}
+  : GenericKernelTy(Name, ExecutionMode) {}
 
   /// Initialize the AMDGPU kernel.
   Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override {
@@ -450,6 +453,9 @@
 // TODO: Read the kernel descriptor for the max threads per block. May be
 // read from the image.
 
+ImplicitArgsSize = utils::getImplicitArgsSize(AMDI

[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-08-28 Thread Joseph Huber via Phabricator via cfe-commits
jhuber6 added a comment.

Just a few more nits. I think it's looking fine but I haven't tested it. Anyone 
else?




Comment at: clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp:406
 
+  // pass on -mllvm options to the clang
+  for (const opt::Arg *Arg : Args.filtered(OPT_mllvm)) {





Comment at: clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp:415-417
+  if (SaveTemps) {
 CmdArgs.push_back("-save-temps");
+  }

No braces around a single line if.



Comment at: openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h:54
+
+uint16_t getImplicitArgsSize(uint16_t Version) {
+  return Version < ELF::ELFABIVERSION_AMDGPU_HSA_V5

We return uint16_t here? These are sizes.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-08-25 Thread Saiyedul Islam via Phabricator via cfe-commits
saiislam updated this revision to Diff 553413.
saiislam marked 2 inline comments as done.
saiislam added a comment.

Changed getImplicitArgsSize to use sizeof.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

Files:
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/lib/CodeGen/CodeGenModule.h
  clang/lib/CodeGen/TargetInfo.h
  clang/lib/CodeGen/Targets/AMDGPU.cpp
  clang/lib/Driver/ToolChain.cpp
  clang/lib/Driver/ToolChains/Clang.cpp
  clang/test/CodeGenCUDA/amdgpu-code-object-version-linking.cu
  clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
  clang/test/CodeGenOpenCL/opencl_types.cl
  clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
  openmp/libomptarget/DeviceRTL/CMakeLists.txt
  openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
  openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h

Index: openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
===
--- openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
+++ openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
@@ -25,6 +25,7 @@
 #include "llvm/Support/MemoryBufferRef.h"
 
 #include "llvm/Support/YAMLTraits.h"
+using namespace llvm::ELF;
 
 namespace llvm {
 namespace omp {
@@ -32,19 +33,29 @@
 namespace plugin {
 namespace utils {
 
-// The implicit arguments of AMDGPU kernels.
+// The implicit arguments of COV5 AMDGPU kernels.
 struct AMDGPUImplicitArgsTy {
-  uint64_t OffsetX;
-  uint64_t OffsetY;
-  uint64_t OffsetZ;
-  uint64_t HostcallPtr;
-  uint64_t Unused0;
-  uint64_t Unused1;
-  uint64_t Unused2;
+  uint32_t BlockCountX;
+  uint32_t BlockCountY;
+  uint32_t BlockCountZ;
+  uint16_t GroupSizeX;
+  uint16_t GroupSizeY;
+  uint16_t GroupSizeZ;
+  uint8_t Unused0[46]; // 46 byte offset.
+  uint16_t GridDims;
+  uint8_t Unused1[190]; // 190 byte offset.
 };
 
-static_assert(sizeof(AMDGPUImplicitArgsTy) == 56,
-  "Unexpected size of implicit arguments");
+// Dummy struct for COV4 implicitargs.
+struct AMDGPUImplicitArgsTyCOV4 {
+  uint8_t Unused[56];
+};
+
+uint16_t getImplicitArgsSize(uint16_t Version) {
+  return Version < ELF::ELFABIVERSION_AMDGPU_HSA_V5
+ ? sizeof(AMDGPUImplicitArgsTyCOV4)
+ : sizeof(AMDGPUImplicitArgsTy);
+}
 
 /// Parse a TargetID to get processor arch and feature map.
 /// Returns processor subarch.
@@ -295,7 +306,8 @@
 /// Reads the AMDGPU specific metadata from the ELF file and propagates the
 /// KernelInfoMap
 Error readAMDGPUMetaDataFromImage(MemoryBufferRef MemBuffer,
-  StringMap &KernelInfoMap) {
+  StringMap &KernelInfoMap,
+  uint16_t &ELFABIVersion) {
   Error Err = Error::success(); // Used later as out-parameter
 
   auto ELFOrError = object::ELF64LEFile::create(MemBuffer.getBuffer());
@@ -305,6 +317,12 @@
   const object::ELF64LEFile ELFObj = ELFOrError.get();
   ArrayRef Sections = cantFail(ELFObj.sections());
   KernelInfoReader Reader(KernelInfoMap);
+
+  // Read the code object version from ELF image header
+  auto Header = ELFObj.getHeader();
+  ELFABIVersion = (uint8_t)(Header.e_ident[ELF::EI_ABIVERSION]);
+  DP("ELFABIVERSION Version: %u\n", ELFABIVersion);
+
   for (const auto &S : Sections) {
 if (S.sh_type != ELF::SHT_NOTE)
   continue;
Index: openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
===
--- openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -381,6 +381,9 @@
   /// Get the executable.
   hsa_executable_t getExecutable() const { return Executable; }
 
+  /// Get the Code Object Version of the ELF
+  uint16_t getELFABIVersion() const { return ELFABIVersion; }
+
   /// Find an HSA device symbol by its name on the executable.
   Expected
   findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const;
@@ -401,6 +404,7 @@
   hsa_executable_t Executable;
   hsa_code_object_t CodeObject;
   StringMap KernelInfoMap;
+  uint16_t ELFABIVersion;
 };
 
 /// Class implementing the AMDGPU kernel functionalities which derives from the
@@ -408,8 +412,7 @@
 struct AMDGPUKernelTy : public GenericKernelTy {
   /// Create an AMDGPU kernel with a name and an execution mode.
   AMDGPUKernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode)
-  : GenericKernelTy(Name, ExecutionMode),
-ImplicitArgsSize(sizeof(utils::AMDGPUImplicitArgsTy)) {}
+  : GenericKernelTy(Name, ExecutionMode) {}
 
   /// Initialize the AMDGPU kernel.
   Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override {
@@ -450,6 +453,9 @@
 // TODO: Read the kernel descriptor for the max threads per block. May be
 // read from the image.
 
+ImplicitArgsSize = utils::getImplicitArgsSize(AMDI

[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-08-24 Thread Joseph Huber via Phabricator via cfe-commits
jhuber6 added inline comments.



Comment at: openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h:49
 
-static_assert(sizeof(AMDGPUImplicitArgsTy) == 56,
-  "Unexpected size of implicit arguments");
+enum IMPLICITARGS : uint32_t {
+  COV4_SIZE = 56,

saiislam wrote:
> jhuber6 wrote:
> > We should probably be using `sizeof` now that it's back to being a struct 
> > and keep the old struct definition.
> AMDGPU plugin doesn't use any implicitarg for COV4, but it does so for COV5. 
> So, we are not keeping two separate structures for implicitargs of COV4 and 
> COV5.
> If we use sizeof then it will always return 256 corresponding to COV5 (even 
> for cov4, which should be 56). That's why we need this function.
Yeah, I guess for COV4 the only thing that mattered was the size so that we 
could make sure it's all set to zero. We shouldn't use the enum value. It 
should be `sizeof(ImplicitArgsTy)` for `COV5` and either hard-code it in the 
function for V4 or make a dummy struct.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-08-24 Thread Saiyedul Islam via Phabricator via cfe-commits
saiislam marked an inline comment as done.
saiislam added inline comments.



Comment at: openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h:49
 
-static_assert(sizeof(AMDGPUImplicitArgsTy) == 56,
-  "Unexpected size of implicit arguments");
+enum IMPLICITARGS : uint32_t {
+  COV4_SIZE = 56,

jhuber6 wrote:
> We should probably be using `sizeof` now that it's back to being a struct and 
> keep the old struct definition.
AMDGPU plugin doesn't use any implicitarg for COV4, but it does so for COV5. 
So, we are not keeping two separate structures for implicitargs of COV4 and 
COV5.
If we use sizeof then it will always return 256 corresponding to COV5 (even for 
cov4, which should be 56). That's why we need this function.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-08-24 Thread Joseph Huber via Phabricator via cfe-commits
jhuber6 added inline comments.



Comment at: openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h:49
 
-static_assert(sizeof(AMDGPUImplicitArgsTy) == 56,
-  "Unexpected size of implicit arguments");
+enum IMPLICITARGS : uint32_t {
+  COV4_SIZE = 56,

We should probably be using `sizeof` now that it's back to being a struct and 
keep the old struct definition.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-08-24 Thread Saiyedul Islam via Phabricator via cfe-commits
saiislam added inline comments.



Comment at: clang/lib/Driver/ToolChains/Clang.cpp:8649-8650
 
+  // code-object-version=X needs to be passed to clang-linker-wrapper to ensure
+  // that it is used by lld.
+  if (const Arg *A = Args.getLastArg(options::OPT_mcode_object_version_EQ)) {

arsenm wrote:
> so device rtl is linked once as a normal library?
No, this is command generation for clang-linker-wrapper. Since, devicertl is 
compiled only to get bitcode file (-c), it is never called.



Comment at: clang/lib/Driver/ToolChains/Clang.cpp:8653-8654
+CmdArgs.push_back(Args.MakeArgString("-mllvm"));
+CmdArgs.push_back(Args.MakeArgString(
+Twine("--amdhsa-code-object-version=") + A->getValue()));
+  }

arsenm wrote:
> Why do you need this? The code object version is supposed to come from a 
> module flag. We should be getting rid of the command line argument for it
During command generation for clang-linker-wrapper, it is required to check 
user's provided `mcode-object-version=X` so that `amdhsa-code-object-version=X` 
can be passed to the clang/lto backend.

`getAmdhsaCodeObjectVersion()` and `getHsaAbiVersion()` both still use the 
above command line argument to override user's choice of COV, instead of the 
module flag.



Comment at: clang/test/CodeGenCUDA/amdgpu-code-object-version-linking.cu:12
+// RUN: llvm-link %t_0 %t_5 -o -| llvm-dis -o - | FileCheck 
-check-prefix=LINKED5 %s
+
+#include "Inputs/cuda.h"

yaxunl wrote:
> need to test using clang -cc1 with -O3 and -mlink-builtin-bitcode to link the 
> device lib and verify the load of llvm.amdgcn.abi.version being eliminated 
> after optimization.
> 
> I think currently it cannot do that since llvm.amdgcn.abi.version is not 
> internalized by the internalization pass. This can cause some significant 
> perf drops since loading is expensive. Need to tweak the function controlling 
> what variables can be internalized for amdgpu so that this variable gets 
> internalized, or having a generic way to tell that function which variables 
> should be internalized, e.g. by adding a metadata amdgcn.internalize
load of llvm.amdgcn.abi.version is being eliminated with cc1, -O3, and 
mlink-builtin-bitcode of device lib.



Comment at: clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp:406-410
+  // pass on -mllvm options to the clang
+  for (const opt::Arg *Arg : Args.filtered(OPT_mllvm)) {
+CmdArgs.push_back("-mllvm");
+CmdArgs.push_back(Arg->getValue());
+  }

arsenm wrote:
> Shouldn't need this?
It is required so that when clang pass (not the lto backend) is called from 
clang-linker-wrapper due to `-save-temps`, user provided COV is correctly 
propagated.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-08-24 Thread Saiyedul Islam via Phabricator via cfe-commits
saiislam updated this revision to Diff 553179.
saiislam marked 7 inline comments as done.
saiislam added a comment.

Updated test case to check internalization of newly inserted global variable.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

Files:
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/lib/CodeGen/CodeGenModule.h
  clang/lib/CodeGen/TargetInfo.h
  clang/lib/CodeGen/Targets/AMDGPU.cpp
  clang/lib/Driver/ToolChain.cpp
  clang/lib/Driver/ToolChains/Clang.cpp
  clang/test/CodeGenCUDA/amdgpu-code-object-version-linking.cu
  clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
  clang/test/CodeGenOpenCL/opencl_types.cl
  clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
  openmp/libomptarget/DeviceRTL/CMakeLists.txt
  openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
  openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h

Index: openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
===
--- openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
+++ openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
@@ -25,6 +25,7 @@
 #include "llvm/Support/MemoryBufferRef.h"
 
 #include "llvm/Support/YAMLTraits.h"
+using namespace llvm::ELF;
 
 namespace llvm {
 namespace omp {
@@ -34,17 +35,25 @@
 
 // The implicit arguments of AMDGPU kernels.
 struct AMDGPUImplicitArgsTy {
-  uint64_t OffsetX;
-  uint64_t OffsetY;
-  uint64_t OffsetZ;
-  uint64_t HostcallPtr;
-  uint64_t Unused0;
-  uint64_t Unused1;
-  uint64_t Unused2;
+  uint32_t BlockCountX;
+  uint32_t BlockCountY;
+  uint32_t BlockCountZ;
+  uint16_t GroupSizeX;
+  uint16_t GroupSizeY;
+  uint16_t GroupSizeZ;
+  uint8_t Unused0[46]; // 46 byte offset.
+  uint16_t GridDims;
+  uint8_t Unused1[190]; // 190 byte offset.
 };
 
-static_assert(sizeof(AMDGPUImplicitArgsTy) == 56,
-  "Unexpected size of implicit arguments");
+enum IMPLICITARGS : uint32_t {
+  COV4_SIZE = 56,
+  COV5_SIZE = 256,
+};
+
+uint16_t getImplicitArgsSize(uint16_t Version) {
+  return Version < ELF::ELFABIVERSION_AMDGPU_HSA_V5 ? COV4_SIZE : COV5_SIZE;
+}
 
 /// Parse a TargetID to get processor arch and feature map.
 /// Returns processor subarch.
@@ -295,7 +304,8 @@
 /// Reads the AMDGPU specific metadata from the ELF file and propagates the
 /// KernelInfoMap
 Error readAMDGPUMetaDataFromImage(MemoryBufferRef MemBuffer,
-  StringMap &KernelInfoMap) {
+  StringMap &KernelInfoMap,
+  uint16_t &ELFABIVersion) {
   Error Err = Error::success(); // Used later as out-parameter
 
   auto ELFOrError = object::ELF64LEFile::create(MemBuffer.getBuffer());
@@ -305,6 +315,12 @@
   const object::ELF64LEFile ELFObj = ELFOrError.get();
   ArrayRef Sections = cantFail(ELFObj.sections());
   KernelInfoReader Reader(KernelInfoMap);
+
+  // Read the code object version from ELF image header
+  auto Header = ELFObj.getHeader();
+  ELFABIVersion = (uint8_t)(Header.e_ident[ELF::EI_ABIVERSION]);
+  DP("ELFABIVERSION Version: %u\n", ELFABIVersion);
+
   for (const auto &S : Sections) {
 if (S.sh_type != ELF::SHT_NOTE)
   continue;
Index: openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
===
--- openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -381,6 +381,9 @@
   /// Get the executable.
   hsa_executable_t getExecutable() const { return Executable; }
 
+  /// Get the Code Object Version of the ELF
+  uint16_t getELFABIVersion() const { return ELFABIVersion; }
+
   /// Find an HSA device symbol by its name on the executable.
   Expected
   findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const;
@@ -401,6 +404,7 @@
   hsa_executable_t Executable;
   hsa_code_object_t CodeObject;
   StringMap KernelInfoMap;
+  uint16_t ELFABIVersion;
 };
 
 /// Class implementing the AMDGPU kernel functionalities which derives from the
@@ -408,8 +412,7 @@
 struct AMDGPUKernelTy : public GenericKernelTy {
   /// Create an AMDGPU kernel with a name and an execution mode.
   AMDGPUKernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode)
-  : GenericKernelTy(Name, ExecutionMode),
-ImplicitArgsSize(sizeof(utils::AMDGPUImplicitArgsTy)) {}
+  : GenericKernelTy(Name, ExecutionMode) {}
 
   /// Initialize the AMDGPU kernel.
   Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override {
@@ -450,6 +453,9 @@
 // TODO: Read the kernel descriptor for the max threads per block. May be
 // read from the image.
 
+ImplicitArgsSize = utils::getImplicitArgsSize(AMDImage.getELFABIVersion());
+DP("ELFABIVersion: %d\n", AMDImage.getELFABIVersion());
+
 // Get additional kernel info read from image
 KernelI

[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-08-23 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl added inline comments.



Comment at: clang/test/CodeGenCUDA/amdgpu-code-object-version-linking.cu:12
+// RUN: llvm-link %t_0 %t_5 -o -| llvm-dis -o - | FileCheck 
-check-prefix=LINKED5 %s
+
+#include "Inputs/cuda.h"

need to test using clang -cc1 with -O3 and -mlink-builtin-bitcode to link the 
device lib and verify the load of llvm.amdgcn.abi.version being eliminated 
after optimization.

I think currently it cannot do that since llvm.amdgcn.abi.version is not 
internalized by the internalization pass. This can cause some significant perf 
drops since loading is expensive. Need to tweak the function controlling what 
variables can be internalized for amdgpu so that this variable gets 
internalized, or having a generic way to tell that function which variables 
should be internalized, e.g. by adding a metadata amdgcn.internalize


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-08-23 Thread Matt Arsenault via Phabricator via cfe-commits
arsenm added a comment.

Codegen parts LGTM, questions with the driver parts




Comment at: clang/lib/Driver/ToolChain.cpp:1368
 if (A->getOption().matches(options::OPT_m_Group)) {
-  if (SameTripleAsHost)
+  // Pass code objection version to device toolchain
+  // to correctly set meta-data in intermediate files.

Typos



Comment at: clang/lib/Driver/ToolChain.cpp:1369
+  // Pass code objection version to device toolchain
+  // to correctly set meta-data in intermediate files.
+  if (SameTripleAsHost ||





Comment at: clang/lib/Driver/ToolChains/Clang.cpp:8649-8650
 
+  // code-object-version=X needs to be passed to clang-linker-wrapper to ensure
+  // that it is used by lld.
+  if (const Arg *A = Args.getLastArg(options::OPT_mcode_object_version_EQ)) {

so device rtl is linked once as a normal library?



Comment at: clang/lib/Driver/ToolChains/Clang.cpp:8653-8654
+CmdArgs.push_back(Args.MakeArgString("-mllvm"));
+CmdArgs.push_back(Args.MakeArgString(
+Twine("--amdhsa-code-object-version=") + A->getValue()));
+  }

Why do you need this? The code object version is supposed to come from a module 
flag. We should be getting rid of the command line argument for it



Comment at: clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp:406-410
+  // pass on -mllvm options to the clang
+  for (const opt::Arg *Arg : Args.filtered(OPT_mllvm)) {
+CmdArgs.push_back("-mllvm");
+CmdArgs.push_back(Arg->getValue());
+  }

Shouldn't need this?



Comment at: clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp:417
 CmdArgs.push_back("-save-temps");
+// CmdArgs.push_back(Args.MakeArgString("--amdhsa-code-object-version=5"));
+  }

Commented out code


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-08-22 Thread Saiyedul Islam via Phabricator via cfe-commits
saiislam updated this revision to Diff 552344.
saiislam marked 5 inline comments as done.
saiislam added a comment.

Used CreateConstInBoundsGEP1_32 for emitting GEP statements. Changed the lambda
function to a simple function body for defining the global variable.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

Files:
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/lib/CodeGen/CodeGenModule.h
  clang/lib/CodeGen/TargetInfo.h
  clang/lib/CodeGen/Targets/AMDGPU.cpp
  clang/lib/Driver/ToolChain.cpp
  clang/lib/Driver/ToolChains/Clang.cpp
  clang/test/CodeGenCUDA/amdgpu-code-object-version-linking.cu
  clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
  clang/test/CodeGenOpenCL/opencl_types.cl
  clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
  openmp/libomptarget/DeviceRTL/CMakeLists.txt
  openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
  openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h

Index: openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
===
--- openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
+++ openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
@@ -25,6 +25,7 @@
 #include "llvm/Support/MemoryBufferRef.h"
 
 #include "llvm/Support/YAMLTraits.h"
+using namespace llvm::ELF;
 
 namespace llvm {
 namespace omp {
@@ -34,17 +35,25 @@
 
 // The implicit arguments of AMDGPU kernels.
 struct AMDGPUImplicitArgsTy {
-  uint64_t OffsetX;
-  uint64_t OffsetY;
-  uint64_t OffsetZ;
-  uint64_t HostcallPtr;
-  uint64_t Unused0;
-  uint64_t Unused1;
-  uint64_t Unused2;
+  uint32_t BlockCountX;
+  uint32_t BlockCountY;
+  uint32_t BlockCountZ;
+  uint16_t GroupSizeX;
+  uint16_t GroupSizeY;
+  uint16_t GroupSizeZ;
+  uint8_t Unused0[46]; // 46 byte offset.
+  uint16_t GridDims;
+  uint8_t Unused1[190]; // 190 byte offset.
 };
 
-static_assert(sizeof(AMDGPUImplicitArgsTy) == 56,
-  "Unexpected size of implicit arguments");
+enum IMPLICITARGS : uint32_t {
+  COV4_SIZE = 56,
+  COV5_SIZE = 256,
+};
+
+uint16_t getImplicitArgsSize(uint16_t Version) {
+  return Version < ELF::ELFABIVERSION_AMDGPU_HSA_V5 ? COV4_SIZE : COV5_SIZE;
+}
 
 /// Parse a TargetID to get processor arch and feature map.
 /// Returns processor subarch.
@@ -295,7 +304,8 @@
 /// Reads the AMDGPU specific metadata from the ELF file and propagates the
 /// KernelInfoMap
 Error readAMDGPUMetaDataFromImage(MemoryBufferRef MemBuffer,
-  StringMap &KernelInfoMap) {
+  StringMap &KernelInfoMap,
+  uint16_t &ELFABIVersion) {
   Error Err = Error::success(); // Used later as out-parameter
 
   auto ELFOrError = object::ELF64LEFile::create(MemBuffer.getBuffer());
@@ -305,6 +315,12 @@
   const object::ELF64LEFile ELFObj = ELFOrError.get();
   ArrayRef Sections = cantFail(ELFObj.sections());
   KernelInfoReader Reader(KernelInfoMap);
+
+  // Read the code object version from ELF image header
+  auto Header = ELFObj.getHeader();
+  ELFABIVersion = (uint8_t)(Header.e_ident[ELF::EI_ABIVERSION]);
+  DP("ELFABIVERSION Version: %u\n", ELFABIVersion);
+
   for (const auto &S : Sections) {
 if (S.sh_type != ELF::SHT_NOTE)
   continue;
Index: openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
===
--- openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -381,6 +381,9 @@
   /// Get the executable.
   hsa_executable_t getExecutable() const { return Executable; }
 
+  /// Get to Code Object Version of the ELF
+  uint16_t getELFABIVersion() const { return ELFABIVersion; }
+
   /// Find an HSA device symbol by its name on the executable.
   Expected
   findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const;
@@ -401,6 +404,7 @@
   hsa_executable_t Executable;
   hsa_code_object_t CodeObject;
   StringMap KernelInfoMap;
+  uint16_t ELFABIVersion;
 };
 
 /// Class implementing the AMDGPU kernel functionalities which derives from the
@@ -408,8 +412,7 @@
 struct AMDGPUKernelTy : public GenericKernelTy {
   /// Create an AMDGPU kernel with a name and an execution mode.
   AMDGPUKernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode)
-  : GenericKernelTy(Name, ExecutionMode),
-ImplicitArgsSize(sizeof(utils::AMDGPUImplicitArgsTy)) {}
+  : GenericKernelTy(Name, ExecutionMode) {}
 
   /// Initialize the AMDGPU kernel.
   Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override {
@@ -450,6 +453,9 @@
 // TODO: Read the kernel descriptor for the max threads per block. May be
 // read from the image.
 
+ImplicitArgsSize = utils::getImplicitArgsSize(AMDImage.getELFABIVersion());
+DP("ELFABIVersion: %d\n", AMDImage.getELFABIVersion());

[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-08-21 Thread Matt Arsenault via Phabricator via cfe-commits
arsenm added inline comments.



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:17067
+
+Value *Iscov5 = CGF.Builder.CreateICmpSGE(
+ABIVersion,

Capitalization is weird, IsCOV5?



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:17082-17083
+Value *DispatchPtr = EmitAMDGPUDispatchPtr(CGF);
+auto *DispatchGEP =
+CGF.Builder.CreateGEP(CGF.Int8Ty, DispatchPtr, DispatchOffset);
+

CreateConstInBoundsGEP1_64



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:17100
+}
+auto *GEP = CGF.Builder.CreateGEP(CGF.Int8Ty, ArgPtr, Offset);
+LD = CGF.Builder.CreateLoad(

CreateConstInBoundsGEP1_64



Comment at: clang/lib/CodeGen/Targets/AMDGPU.cpp:364
+CodeGen::CodeGenModule &CGM) const {
+  auto AddGlobal = [&](StringRef Name,
+   clang::TargetOptions::CodeObjectVersionKind Value,

Single use lamdba, just make this the function body



Comment at: clang/lib/CodeGen/Targets/AMDGPU.cpp:381
+GV->setVisibility(llvm::GlobalValue::VisibilityTypes::HiddenVisibility);
+GV->setAlignment(CGM.getDataLayout().getABITypeAlign(Type));
+  };

No real point setting the alignment


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-08-21 Thread Saiyedul Islam via Phabricator via cfe-commits
saiislam updated this revision to Diff 552085.
saiislam marked an inline comment as done.
saiislam added a comment.

Addressed the reviewer's comments.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

Files:
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/lib/CodeGen/CodeGenModule.h
  clang/lib/CodeGen/TargetInfo.h
  clang/lib/CodeGen/Targets/AMDGPU.cpp
  clang/lib/Driver/ToolChain.cpp
  clang/lib/Driver/ToolChains/Clang.cpp
  clang/test/CodeGenCUDA/amdgpu-code-object-version-linking.cu
  clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
  clang/test/CodeGenOpenCL/opencl_types.cl
  clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
  openmp/libomptarget/DeviceRTL/CMakeLists.txt
  openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
  openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h

Index: openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
===
--- openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
+++ openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
@@ -25,6 +25,7 @@
 #include "llvm/Support/MemoryBufferRef.h"
 
 #include "llvm/Support/YAMLTraits.h"
+using namespace llvm::ELF;
 
 namespace llvm {
 namespace omp {
@@ -34,17 +35,25 @@
 
 // The implicit arguments of AMDGPU kernels.
 struct AMDGPUImplicitArgsTy {
-  uint64_t OffsetX;
-  uint64_t OffsetY;
-  uint64_t OffsetZ;
-  uint64_t HostcallPtr;
-  uint64_t Unused0;
-  uint64_t Unused1;
-  uint64_t Unused2;
+  uint32_t BlockCountX;
+  uint32_t BlockCountY;
+  uint32_t BlockCountZ;
+  uint16_t GroupSizeX;
+  uint16_t GroupSizeY;
+  uint16_t GroupSizeZ;
+  uint8_t Unused0[46]; // 46 byte offset.
+  uint16_t GridDims;
+  uint8_t Unused1[190]; // 190 byte offset.
 };
 
-static_assert(sizeof(AMDGPUImplicitArgsTy) == 56,
-  "Unexpected size of implicit arguments");
+enum IMPLICITARGS : uint32_t {
+  COV4_SIZE = 56,
+  COV5_SIZE = 256,
+};
+
+uint16_t getImplicitArgsSize(uint16_t Version) {
+  return Version < ELF::ELFABIVERSION_AMDGPU_HSA_V5 ? COV4_SIZE : COV5_SIZE;
+}
 
 /// Parse a TargetID to get processor arch and feature map.
 /// Returns processor subarch.
@@ -295,7 +304,8 @@
 /// Reads the AMDGPU specific metadata from the ELF file and propagates the
 /// KernelInfoMap
 Error readAMDGPUMetaDataFromImage(MemoryBufferRef MemBuffer,
-  StringMap &KernelInfoMap) {
+  StringMap &KernelInfoMap,
+  uint16_t &ELFABIVersion) {
   Error Err = Error::success(); // Used later as out-parameter
 
   auto ELFOrError = object::ELF64LEFile::create(MemBuffer.getBuffer());
@@ -305,6 +315,12 @@
   const object::ELF64LEFile ELFObj = ELFOrError.get();
   ArrayRef Sections = cantFail(ELFObj.sections());
   KernelInfoReader Reader(KernelInfoMap);
+
+  // Read the code object version from ELF image header
+  auto Header = ELFObj.getHeader();
+  ELFABIVersion = (uint8_t)(Header.e_ident[ELF::EI_ABIVERSION]);
+  DP("ELFABIVERSION Version: %u\n", ELFABIVersion);
+
   for (const auto &S : Sections) {
 if (S.sh_type != ELF::SHT_NOTE)
   continue;
Index: openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
===
--- openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -381,6 +381,9 @@
   /// Get the executable.
   hsa_executable_t getExecutable() const { return Executable; }
 
+  /// Get to Code Object Version of the ELF
+  uint16_t getELFABIVersion() const { return ELFABIVersion; }
+
   /// Find an HSA device symbol by its name on the executable.
   Expected
   findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const;
@@ -401,6 +404,7 @@
   hsa_executable_t Executable;
   hsa_code_object_t CodeObject;
   StringMap KernelInfoMap;
+  uint16_t ELFABIVersion;
 };
 
 /// Class implementing the AMDGPU kernel functionalities which derives from the
@@ -408,8 +412,7 @@
 struct AMDGPUKernelTy : public GenericKernelTy {
   /// Create an AMDGPU kernel with a name and an execution mode.
   AMDGPUKernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode)
-  : GenericKernelTy(Name, ExecutionMode),
-ImplicitArgsSize(sizeof(utils::AMDGPUImplicitArgsTy)) {}
+  : GenericKernelTy(Name, ExecutionMode) {}
 
   /// Initialize the AMDGPU kernel.
   Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override {
@@ -450,6 +453,9 @@
 // TODO: Read the kernel descriptor for the max threads per block. May be
 // read from the image.
 
+ImplicitArgsSize = utils::getImplicitArgsSize(AMDImage.getELFABIVersion());
+DP("ELFABIVersion: %d\n", AMDImage.getELFABIVersion());
+
 // Get additional kernel info read from image
 KernelInfo = AMDImage.getKernelInfo(getName());
 if

[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-08-21 Thread Saiyedul Islam via Phabricator via cfe-commits
saiislam marked 4 inline comments as done.
saiislam added inline comments.



Comment at: clang/lib/CodeGen/Targets/AMDGPU.cpp:383
+CGM.getTarget().getTargetOpts().CodeObjectVersion, /*Size=*/32,
+llvm::GlobalValue::WeakODRLinkage);
+}

yaxunl wrote:
> 
> I am not sure weak_odr linkage will work when code object version is none. 
> This will cause conflict when a module emitted with cov none is linked with a 
> module emitted with cov4 or cov5. Also, when all modules are emitted with cov 
> none, we end up with a linked module with cov none and the work group size 
> code will not work.
> 
> Probably we need to emit llvm.amdgcn.abi.version with external linkage for 
> cov none.
> 
> Another issue is that llvm.amdgcn.abi.version is not internalized. It is 
> always loaded from memory even though it is in constant address space. This 
> will cause bad performance. Considering device libs may use clang builtin for 
> workgroup size. The performance impact may be significant. To avoid 
> performance degradation, we need to internalize it as early as possible in 
> the optimization pipeline.
I tried external linkage but it didn't work. Only weak_odr is working fine.



Comment at: clang/lib/CodeGen/Targets/AMDGPU.cpp:369-386
+if (CGM.getModule().getNamedGlobal(Name))
+  return;
+
+auto *Type =
+llvm::IntegerType::getIntNTy(CGM.getModule().getContext(), Size);
+auto *GV = new llvm::GlobalVariable(
+CGM.getModule(), Type, true, Linkage,

arsenm wrote:
> You moved GetOrCreateLLVMGlobal but don't use it? 
> 
> The lamdba is unnecessary for a single local use
I am using GetOrCreateLLVMGlobal in CGBuiltin.cpp while emitting code for 
amdgpu_worgroup_size.



Comment at: clang/lib/CodeGen/Targets/AMDGPU.cpp:369-386
+if (CGM.getModule().getNamedGlobal(Name))
+  return;
+
+auto *Type =
+llvm::IntegerType::getIntNTy(CGM.getModule().getContext(), Size);
+auto *GV = new llvm::GlobalVariable(
+CGM.getModule(), Type, true, Linkage,

saiislam wrote:
> arsenm wrote:
> > You moved GetOrCreateLLVMGlobal but don't use it? 
> > 
> > The lamdba is unnecessary for a single local use
> I am using GetOrCreateLLVMGlobal in CGBuiltin.cpp while emitting code for 
> amdgpu_worgroup_size.
I was hoping that this patch will pave way for D130096, so that it can generate 
rest of the control constants using the same lambda.
I can remove this and simplify the code if you want.



Comment at: clang/lib/Driver/ToolChain.cpp:1372
+  if (SameTripleAsHost ||
+  A->getOption().matches(options::OPT_mcode_object_version_EQ))
 DAL->append(A);

arsenm wrote:
> Don't understand why this is necessary
This function creates a derived argument list for OpenMP target specific flags.
`mcode-object-version` remains unset for the device compilation step if we don't
pass it here.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-08-18 Thread Matt Arsenault via Phabricator via cfe-commits
arsenm added inline comments.



Comment at: clang/test/CodeGenCUDA/amdgpu-code-object-version-linking.cu:40-43
+__device__ void bar(int *out)
+{
+  *out = __builtin_amdgcn_workgroup_size_x();
+}

test all the builtins?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-08-18 Thread Matt Arsenault via Phabricator via cfe-commits
arsenm added inline comments.



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:17057
+  Constant *Offset, *OffsetOld;
+  Value *DP, *DP1;
+

Spell out to DispatchPtr?



Comment at: clang/lib/CodeGen/CodeGenModule.cpp:1206-1208
+  getTargetCodeGenInfo().emitTargetGlobals(*this);
+
   getTargetCodeGenInfo().emitTargetMetadata(*this, MangledDeclNames);

These could be one combined hook? this isn't really different from metadata



Comment at: clang/lib/CodeGen/Targets/AMDGPU.cpp:369-386
+if (CGM.getModule().getNamedGlobal(Name))
+  return;
+
+auto *Type =
+llvm::IntegerType::getIntNTy(CGM.getModule().getContext(), Size);
+auto *GV = new llvm::GlobalVariable(
+CGM.getModule(), Type, true, Linkage,

You moved GetOrCreateLLVMGlobal but don't use it? 

The lamdba is unnecessary for a single local use



Comment at: clang/lib/Driver/ToolChain.cpp:1369
 if (A->getOption().matches(options::OPT_m_Group)) {
-  if (SameTripleAsHost)
+  // pass code objection version to device toolchain
+  // to correctly set meta-data in intermediate files

Capitalize



Comment at: clang/lib/Driver/ToolChain.cpp:1372
+  if (SameTripleAsHost ||
+  A->getOption().matches(options::OPT_mcode_object_version_EQ))
 DAL->append(A);

Don't understand why this is necessary


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-08-18 Thread Saiyedul Islam via Phabricator via cfe-commits
saiislam added a comment.

In D139730#4597504 , @jhuber6 wrote:

> Some nits. I'm assuming we're getting the code object in the backend now? 
> We'll need to make sure that `-Wl,--amdhsa-code-object-version` is passed to 
> the clang invocation inside of the `clang-linker-wrapper` to handle 
> `-save-temps` mode.

Clang-linker-wrapper was not passing the `-mllvm` option to the clang backend.




Comment at: openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h:36
 
-// The implicit arguments of AMDGPU kernels.
-struct AMDGPUImplicitArgsTy {
-  uint64_t OffsetX;
-  uint64_t OffsetY;
-  uint64_t OffsetZ;
-  uint64_t HostcallPtr;
-  uint64_t Unused0;
-  uint64_t Unused1;
-  uint64_t Unused2;
+enum IMPLICITARGS : uint32_t {
+  COV4_SIZE = 56,

jhuber6 wrote:
> I'm still not a fan of replacing the struct. The mnemonic of having a struct 
> is much more user friendly.
> ```
> ImplicitArgsTy Args{};
> std::memset(&Args, sizeof(ImplicitArgsTy), 0);
> ...
> ```
> If we don't use something, just make it some random bytes, e.g.
> ```
> struct ImplicitArgsTy {
>   uint64_t OffsetX;
>   uint8_t Unused[64]; // 64 byte offset.
> };
> ```
Replaced.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-08-18 Thread Saiyedul Islam via Phabricator via cfe-commits
saiislam updated this revision to Diff 551597.
saiislam marked 4 inline comments as done.
saiislam added a comment.

Changed the ImplicitArgs implementation to use a struct.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

Files:
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/lib/CodeGen/CodeGenModule.h
  clang/lib/CodeGen/TargetInfo.h
  clang/lib/CodeGen/Targets/AMDGPU.cpp
  clang/lib/Driver/ToolChain.cpp
  clang/lib/Driver/ToolChains/Clang.cpp
  clang/test/CodeGenCUDA/amdgpu-code-object-version-linking.cu
  clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
  clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
  openmp/libomptarget/DeviceRTL/CMakeLists.txt
  openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
  openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h

Index: openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
===
--- openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
+++ openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
@@ -25,6 +25,7 @@
 #include "llvm/Support/MemoryBufferRef.h"
 
 #include "llvm/Support/YAMLTraits.h"
+using namespace llvm::ELF;
 
 namespace llvm {
 namespace omp {
@@ -34,17 +35,25 @@
 
 // The implicit arguments of AMDGPU kernels.
 struct AMDGPUImplicitArgsTy {
-  uint64_t OffsetX;
-  uint64_t OffsetY;
-  uint64_t OffsetZ;
-  uint64_t HostcallPtr;
-  uint64_t Unused0;
-  uint64_t Unused1;
-  uint64_t Unused2;
+  uint32_t BlockCountX;
+  uint32_t BlockCountY;
+  uint32_t BlockCountZ;
+  uint16_t GroupSizeX;
+  uint16_t GroupSizeY;
+  uint16_t GroupSizeZ;
+  uint8_t Unused0[46]; // 46 byte offset.
+  uint16_t GridDims;
+  uint8_t Unused1[190]; // 190 byte offset.
 };
 
-static_assert(sizeof(AMDGPUImplicitArgsTy) == 56,
-  "Unexpected size of implicit arguments");
+enum IMPLICITARGS : uint32_t {
+  COV4_SIZE = 56,
+  COV5_SIZE = 256,
+};
+
+uint16_t getImplicitArgsSize(uint16_t Version) {
+  return Version < ELF::ELFABIVERSION_AMDGPU_HSA_V5 ? COV4_SIZE : COV5_SIZE;
+}
 
 /// Parse a TargetID to get processor arch and feature map.
 /// Returns processor subarch.
@@ -295,7 +304,8 @@
 /// Reads the AMDGPU specific metadata from the ELF file and propagates the
 /// KernelInfoMap
 Error readAMDGPUMetaDataFromImage(MemoryBufferRef MemBuffer,
-  StringMap &KernelInfoMap) {
+  StringMap &KernelInfoMap,
+  uint16_t &ELFABIVersion) {
   Error Err = Error::success(); // Used later as out-parameter
 
   auto ELFOrError = object::ELF64LEFile::create(MemBuffer.getBuffer());
@@ -305,6 +315,12 @@
   const object::ELF64LEFile ELFObj = ELFOrError.get();
   ArrayRef Sections = cantFail(ELFObj.sections());
   KernelInfoReader Reader(KernelInfoMap);
+
+  // Read the code object version from ELF image header
+  auto Header = ELFObj.getHeader();
+  ELFABIVersion = (uint8_t)(Header.e_ident[ELF::EI_ABIVERSION]);
+  DP("ELFABIVERSION Version: %u\n", ELFABIVersion);
+
   for (const auto &S : Sections) {
 if (S.sh_type != ELF::SHT_NOTE)
   continue;
Index: openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
===
--- openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -381,6 +381,9 @@
   /// Get the executable.
   hsa_executable_t getExecutable() const { return Executable; }
 
+  /// Get to Code Object Version of the ELF
+  uint16_t getELFABIVersion() const { return ELFABIVersion; }
+
   /// Find an HSA device symbol by its name on the executable.
   Expected
   findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const;
@@ -401,6 +404,7 @@
   hsa_executable_t Executable;
   hsa_code_object_t CodeObject;
   StringMap KernelInfoMap;
+  uint16_t ELFABIVersion;
 };
 
 /// Class implementing the AMDGPU kernel functionalities which derives from the
@@ -408,8 +412,7 @@
 struct AMDGPUKernelTy : public GenericKernelTy {
   /// Create an AMDGPU kernel with a name and an execution mode.
   AMDGPUKernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode)
-  : GenericKernelTy(Name, ExecutionMode),
-ImplicitArgsSize(sizeof(utils::AMDGPUImplicitArgsTy)) {}
+  : GenericKernelTy(Name, ExecutionMode) {}
 
   /// Initialize the AMDGPU kernel.
   Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override {
@@ -450,6 +453,12 @@
 // TODO: Read the kernel descriptor for the max threads per block. May be
 // read from the image.
 
+ImplicitArgsSize =
+(AMDImage.getELFABIVersion() < llvm::ELF::ELFABIVERSION_AMDGPU_HSA_V5)
+? utils::COV4_SIZE
+: utils::COV5_SIZE;
+DP("ELFABIVersion: %d\n", AMDImage.getELFABIVersion());
+
 // Get additional kernel info read from im

[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-08-17 Thread Joseph Huber via Phabricator via cfe-commits
jhuber6 added a comment.

Some nits. I'm assuming we're getting the code object in the backend now? We'll 
need to make sure that `-Wl,--amdhsa-code-object-version` is passed to the 
clang invocation inside of the `clang-linker-wrapper` to handle `-save-temps` 
mode.




Comment at: clang/lib/CodeGen/CGBuiltin.cpp:17053
+/// Note: "llvm.amdgcn.abi.version" is supposed to be emitted and intialized by
+///   the clang during compilation of user code.
 Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) {





Comment at: clang/lib/Driver/ToolChain.cpp:1363
   for (auto *A : Args) {
+
 // Exclude flags which may only apply to the host toolchain.

Random whitespace.



Comment at: clang/test/CodeGenCUDA/amdgpu-code-object-version-linking.cu:54
+#endif
\ No newline at end of file


Need newline



Comment at: openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp:3016
+  if (getImplicitArgsSize() < utils::COV5_SIZE) {
+DP("Setting fields of ImplicitArgs for COV4\n");
+  } else {

Don't think this needs to be a debug message, same below



Comment at: openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h:36
 
-// The implicit arguments of AMDGPU kernels.
-struct AMDGPUImplicitArgsTy {
-  uint64_t OffsetX;
-  uint64_t OffsetY;
-  uint64_t OffsetZ;
-  uint64_t HostcallPtr;
-  uint64_t Unused0;
-  uint64_t Unused1;
-  uint64_t Unused2;
+enum IMPLICITARGS : uint32_t {
+  COV4_SIZE = 56,

I'm still not a fan of replacing the struct. The mnemonic of having a struct is 
much more user friendly.
```
ImplicitArgsTy Args{};
std::memset(&Args, sizeof(ImplicitArgsTy), 0);
...
```
If we don't use something, just make it some random bytes, e.g.
```
struct ImplicitArgsTy {
  uint64_t OffsetX;
  uint8_t Unused[64]; // 64 byte offset.
};
```


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-08-17 Thread Saiyedul Islam via Phabricator via cfe-commits
saiislam added inline comments.



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:17143-17145
+  llvm::LoadInst *LD;
+  Constant *Offset, *Offset1;
+  Value *DP, *DP1;

arsenm wrote:
> Move down to define and initialize
There are multiple uses of the same identifier. Defining them four times looks 
odd.



Comment at: openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp:2550-2551
+Error Err = retrieveAllMemoryPools();
+if (Err)
+  return Plugin::error("Unable to retieve all memmory pools");
+

jhuber6 wrote:
> This and below isn't correct. You can't discard an `llvm::Error` value like 
> this without either doing `consumeError(std::move(Err))` or 
> `toString(std::move(Err))`. However, you don't need to consume these in the 
> first place, they already contain the error message from the callee and 
> should just be forwarded.
Removed the logic for preallocatedheap.



Comment at: openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp:1752
+// if (auto Err = preAllocateDeviceMemoryPool())
+//   return Err;
+

jhuber6 wrote:
> saiislam wrote:
> > jhuber6 wrote:
> > > Leftover?
> > No, it is not a leftover.
> > One of the fields in the cov5 implicit kernarg is the heap_v1 ptr. It should point
> > to a 128KB zero-initialized block of coarse-grained memory on each device 
> > before launching the kernel. This code was working a while ago, but right 
> > now it is failing most likely due to some latest change in devicertl memory 
> > handling mechanism.
> > I need to debug it with this patch, otherwise it will cause all target 
> > region code calling device-malloc to fail.
> > I will try to fix it before the next revision.
> Do we really need that? We only use a fraction of the existing implicit 
> arguments. My understanding is that most of these are more for runtime 
> handling for HIP and OpenCL while we would most likely want our own solution. 
> I'm assuming that the 128KB is not required for anything we use?
I have removed the preallocatedheap work from this patch.



Comment at: openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h:36
 
-// The implicit arguments of AMDGPU kernels.
-struct AMDGPUImplicitArgsTy {
-  uint64_t OffsetX;
-  uint64_t OffsetY;
-  uint64_t OffsetZ;
-  uint64_t HostcallPtr;
-  uint64_t Unused0;
-  uint64_t Unused1;
-  uint64_t Unused2;
+enum IMPLICITARGS : uint32_t {
+  COV4_SIZE = 56,

jhuber6 wrote:
> saiislam wrote:
> > jhuber6 wrote:
> > > arsenm wrote:
> > > > This is getting duplicated a few places, should it move to a support 
> > > > header?
> > > > 
> > > > I don't love the existing APIs for this, I think a struct definition 
> > > > makes more sense
> > > The other user here is my custom loader, @JonChesterfield has talked 
> > > about wanting a common HSA helper header for awhile now.
> > > 
> > > I agree that the struct definition is much better. Being able to simply 
> > > allocate this size and then zero fill it is much cleaner.
> > Defining a struct for the whole 256 bytes of implicitargs in cov5 was becoming a
> > little difficult due to different sizes of various fields (2, 4, 6, 8, 48, 
> > 72 bytes) along with multiple reserved fields in between. It made sense for 
> > cov4 because it only had 7 fields of 8 bytes each, where we needed only 4th 
> > field in OpenMP runtime (for hostcall_buffer).
> > 
> > Offset based lookups like the following allows handling/exposing only 
> > required fields across generations of ABI.
> If we don't use it, just put it as `unused`. It's really hard to read as-is 
> and it makes it more difficult to just zero fill.
I have reduced the fields to bare minimum required for OpenMP.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-08-17 Thread Saiyedul Islam via Phabricator via cfe-commits
saiislam updated this revision to Diff 551266.
saiislam marked 6 inline comments as done.
saiislam added a comment.

Updated the patch as per the reviewers' comments.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

Files:
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/lib/CodeGen/CodeGenModule.h
  clang/lib/CodeGen/TargetInfo.h
  clang/lib/CodeGen/Targets/AMDGPU.cpp
  clang/lib/Driver/ToolChain.cpp
  clang/lib/Driver/ToolChains/Clang.cpp
  clang/test/CodeGenCUDA/amdgpu-code-object-version-linking.cu
  clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
  openmp/libomptarget/DeviceRTL/CMakeLists.txt
  openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
  openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h

Index: openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
===
--- openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
+++ openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
@@ -25,6 +25,7 @@
 #include "llvm/Support/MemoryBufferRef.h"
 
 #include "llvm/Support/YAMLTraits.h"
+using namespace llvm::ELF;
 
 namespace llvm {
 namespace omp {
@@ -32,19 +33,29 @@
 namespace plugin {
 namespace utils {
 
-// The implicit arguments of AMDGPU kernels.
-struct AMDGPUImplicitArgsTy {
-  uint64_t OffsetX;
-  uint64_t OffsetY;
-  uint64_t OffsetZ;
-  uint64_t HostcallPtr;
-  uint64_t Unused0;
-  uint64_t Unused1;
-  uint64_t Unused2;
+enum IMPLICITARGS : uint32_t {
+  COV4_SIZE = 56,
+  COV5_SIZE = 256,
+
+  COV5_BLOCK_COUNT_X_OFFSET = 0,
+  COV5_BLOCK_COUNT_X_SIZE = 4,
+
+  COV5_GROUP_SIZE_X_OFFSET = 12,
+  COV5_GROUP_SIZE_X_SIZE = 2,
+
+  COV5_GROUP_SIZE_Y_OFFSET = 14,
+  COV5_GROUP_SIZE_Y_SIZE = 2,
+
+  COV5_GROUP_SIZE_Z_OFFSET = 16,
+  COV5_GROUP_SIZE_Z_SIZE = 2,
+
+  COV5_GRID_DIMS_OFFSET = 64,
+  COV5_GRID_DIMS_SIZE = 2,
 };
 
-static_assert(sizeof(AMDGPUImplicitArgsTy) == 56,
-  "Unexpected size of implicit arguments");
+uint16_t getImplicitArgsSize(uint16_t Version) {
+  return Version < ELF::ELFABIVERSION_AMDGPU_HSA_V5 ? 56 : 256;
+}
 
 /// Parse a TargetID to get processor arch and feature map.
 /// Returns processor subarch.
@@ -295,7 +306,8 @@
 /// Reads the AMDGPU specific metadata from the ELF file and propagates the
 /// KernelInfoMap
 Error readAMDGPUMetaDataFromImage(MemoryBufferRef MemBuffer,
-  StringMap &KernelInfoMap) {
+  StringMap &KernelInfoMap,
+  uint16_t &ELFABIVersion) {
   Error Err = Error::success(); // Used later as out-parameter
 
   auto ELFOrError = object::ELF64LEFile::create(MemBuffer.getBuffer());
@@ -305,6 +317,12 @@
   const object::ELF64LEFile ELFObj = ELFOrError.get();
   ArrayRef Sections = cantFail(ELFObj.sections());
   KernelInfoReader Reader(KernelInfoMap);
+
+  // Read the code object version from ELF image header
+  auto Header = ELFObj.getHeader();
+  ELFABIVersion = (uint8_t)(Header.e_ident[ELF::EI_ABIVERSION]);
+  DP("ELFABIVERSION Version: %u\n", ELFABIVersion);
+
   for (const auto &S : Sections) {
 if (S.sh_type != ELF::SHT_NOTE)
   continue;
Index: openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
===
--- openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -253,6 +253,13 @@
 return Plugin::check(Status, "Error in hsa_amd_agents_allow_access: %s");
   }
 
+  Error zeroInitializeMemory(void *Ptr, size_t Size) {
+uint64_t Rounded = sizeof(uint32_t) * ((Size + 3) / sizeof(uint32_t));
+hsa_status_t Status =
+hsa_amd_memory_fill(Ptr, 0, Rounded / sizeof(uint32_t));
+return Plugin::check(Status, "Error in hsa_amd_memory_fill: %s");
+  }
+
   /// Get attribute from the memory pool.
   template 
   Error getAttr(hsa_amd_memory_pool_info_t Kind, Ty &Value) const {
@@ -381,6 +388,9 @@
   /// Get the executable.
   hsa_executable_t getExecutable() const { return Executable; }
 
+  /// Get to Code Object Version of the ELF
+  uint16_t getELFABIVersion() const { return ELFABIVersion; }
+
   /// Find an HSA device symbol by its name on the executable.
   Expected
   findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const;
@@ -401,6 +411,7 @@
   hsa_executable_t Executable;
   hsa_code_object_t CodeObject;
   StringMap KernelInfoMap;
+  uint16_t ELFABIVersion;
 };
 
 /// Class implementing the AMDGPU kernel functionalities which derives from the
@@ -408,8 +419,7 @@
 struct AMDGPUKernelTy : public GenericKernelTy {
   /// Create an AMDGPU kernel with a name and an execution mode.
   AMDGPUKernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode)
-  : GenericKernelTy(Name, ExecutionMode),
-ImplicitArgsSize(sizeof(utils::AMDGPUImplicitArgsTy)) {}
+  

[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-08-07 Thread Matt Arsenault via Phabricator via cfe-commits
arsenm added inline comments.



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:17143-17145
+  llvm::LoadInst *LD;
+  Constant *Offset, *Offset1;
+  Value *DP, *DP1;

Move down to define and initialize



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:17163-17165
+BasicBlock *NewABI = CGF.createBasicBlock("amdgcn.abi.cov5", TheFunction);
+BasicBlock *OldABI = CGF.createBasicBlock("amdgcn.abi.cov4", nullptr);
+BasicBlock *End = CGF.createBasicBlock("amdgcn.abi.end", nullptr);

You could write all of this in terms of selects and avoid introducing all these 
blocks



Comment at: clang/lib/CodeGen/Targets/AMDGPU.cpp:358
+CodeGen::CodeGenModule &CGM) const {
+  if (!CGM.getTriple().isAMDGCN())
+return;

Don't need this?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-08-07 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl added a comment.

I would suggest separating the clang/llvm part into a separate review.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-08-07 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl added a comment.

need a lit test for the codegen of the clang builtin for cov 4/5/none and a lit 
test to show the branching code generated with cov none can be optimized away 
when linked with cov4 or cov5.




Comment at: clang/lib/CodeGen/Targets/AMDGPU.cpp:383
+CGM.getTarget().getTargetOpts().CodeObjectVersion, /*Size=*/32,
+llvm::GlobalValue::WeakODRLinkage);
+}


I am not sure weak_odr linkage will work when code object version is none. This 
will cause conflict when a module emitted with cov none is linked with a module 
emitted with cov4 or cov5. Also, when all modules are emitted with cov none, we 
end up with a linked module with cov none and the work group size code will not 
work.

Probably we need to emit llvm.amdgcn.abi.version with external linkage for cov 
none.

Another issue is that llvm.amdgcn.abi.version is not internalized. It is always 
loaded from memory even though it is in constant address space. This will cause 
bad performance. Considering device libs may use clang builtin for workgroup 
size. The performance impact may be significant. To avoid performance 
degradation, we need to internalize it as early as possible in the optimization 
pipeline.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-08-07 Thread Joseph Huber via Phabricator via cfe-commits
jhuber6 added inline comments.



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:17138
+///and use its value for COV_4 or COV_5 approach. It is used for
+///compiling device libraries in ABI-agnostic way.
+///





Comment at: clang/lib/CodeGen/CGBuiltin.cpp:17187-17188
+Address(Result, CGF.Int16Ty, CharUnits::fromQuantity(2)));
+  } else {
+if (Cov == clang::TargetOptions::COV_5) {
+  // Indexing the implicit kernarg segment.

saiislam wrote:
> jhuber6 wrote:
> > nit.
> There are a couple of common lines after the inner if-else, in the outer else 
> section.
You should be able to factor out
```
LD = CGF.Builder.CreateLoad(
Address(Result, CGF.Int16Ty, CharUnits::fromQuantity(2)));
```
from both by making each assign the `Result` to a value.



Comment at: openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp:2550-2551
+Error Err = retrieveAllMemoryPools();
+if (Err)
+  return Plugin::error("Unable to retieve all memmory pools");
+

This and below isn't correct. You can't discard an `llvm::Error` value like 
this without either doing `consumeError(std::move(Err))` or 
`toString(std::move(Err))`. However, you don't need to consume these in the 
first place, they already contain the error message from the callee and should 
just be forwarded.



Comment at: openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp:1752
+// if (auto Err = preAllocateDeviceMemoryPool())
+//   return Err;
+

saiislam wrote:
> jhuber6 wrote:
> > Leftover?
> No, it is not a left over.
> One of the fields in cov5 implicitkernarg is heap_v1 ptr. It should point to 
> a 128KB zero-initialized block of coarse-grained memory on each device before 
> launching the kernel. This code was working a while ago, but right now it is 
> failing most likely due to some latest change in devicertl memory handling 
> mechanism.
> I need to debug it with this patch, otherwise it will cause all target region 
> code calling device-malloc to fail.
> I will try to fix it before the next revision.
Do we really need that? We only use a fraction of the existing implicit 
arguments. My understanding is that most of these are more for runtime handling 
for HIP and OpenCL while we would most likely want our own solution. I'm 
assuming that the 128KB is not required for anything we use?



Comment at: openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h:36
 
-// The implicit arguments of AMDGPU kernels.
-struct AMDGPUImplicitArgsTy {
-  uint64_t OffsetX;
-  uint64_t OffsetY;
-  uint64_t OffsetZ;
-  uint64_t HostcallPtr;
-  uint64_t Unused0;
-  uint64_t Unused1;
-  uint64_t Unused2;
+enum IMPLICITARGS : uint32_t {
+  COV4_SIZE = 56,

saiislam wrote:
> jhuber6 wrote:
> > arsenm wrote:
> > > This is getting duplicated a few places, should it move to a support 
> > > header?
> > > 
> > > I don't love the existing APIs for this, I think a struct definition 
> > > makes more sense
> > The other user here is my custom loader, @JonChesterfield has talked about 
> > wanting a common HSA helper header for a while now.
> > 
> > I agree that the struct definition is much better. Being able to simply 
> > allocate this size and then zero fill it is much cleaner.
> Defining a struct for whole 256 byte of implicitargs in cov5 was becoming a 
> little difficult due to different sizes of various fields (2, 4, 6, 8, 48, 72 
> bytes) along with multiple reserved fields in between. It made sense for cov4 
> because it only had 7 fields of 8 bytes each, where we needed only 4th field 
> in OpenMP runtime (for hostcall_buffer).
> 
> Offset based lookups like the following allows handling/exposing only 
> required fields across generations of ABI.
If we don't use it, just put it as `unused`. It's really hard to read as-is and 
it makes it more difficult to just zero fill.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-08-07 Thread Saiyedul Islam via Phabricator via cfe-commits
saiislam added a comment.

In D139730#4561622 , @arsenm wrote:

> In D139730#4561619 , @arsenm wrote:
>
>> In D139730#4561575 , @jhuber6 
>> wrote:
>>
>>> In D139730#4561573 , @arsenm 
>>> wrote:
>>>
 In D139730#4561540 , @jhuber6 
 wrote:

> Could you explain briefly what the approach here is? I'm confused as to 
> what's actually changed and how we're handling this difference. I thought 
> if this was just the definition of some builtin function we could just 
> rely on the backend to figure it out. Why do we need to know the code 
> object version inside the device RTL?

 The build is called in the device rtl, so the device RTL needs to contain 
 both implementations. The "backend figuring it out" is dead code 
 elimination
>>>
>>> Okay, do we expect to re-use this interface anywhere? If it's just for 
>>> OpenMP then we should probably copy the approach taken for 
>>> `__omp_rtl_debug_kind`, which is a global created on the GPU by 
>>> `CGOpenMPRuntimeGPU`'s constructor and does more or less the same thing.
>>
>> device libs replicates the same scheme using its own copy of an equivalent 
>> variable. Trying to merge those two together
>
> Although I guess that doesn't really need the builtin changes?

This builtin was already aware of cov4 and cov5. All this patch is changing 
is making it aware of a possibility where both need to be present.
It is already used by device-libs, deviceRTL, and libc-gpu.
Also, encapsulating ABI related changes in implementation of the builtin allows 
other runtime developers to be agnostic to these lower level changes.




Comment at: clang/lib/CodeGen/CGBuiltin.cpp:17187-17188
+Address(Result, CGF.Int16Ty, CharUnits::fromQuantity(2)));
+  } else {
+if (Cov == clang::TargetOptions::COV_5) {
+  // Indexing the implicit kernarg segment.

jhuber6 wrote:
> nit.
There are a couple of common lines after the inner if-else, in the outer else 
section.



Comment at: openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp:1752
+// if (auto Err = preAllocateDeviceMemoryPool())
+//   return Err;
+

jhuber6 wrote:
> Leftover?
No, it is not a left over.
One of the fields in cov5 implicitkernarg is heap_v1 ptr. It should point to a 
128KB zero-initialized block of coarse-grained memory on each device before 
launching the kernel. This code was working a while ago, but right now it is 
failing most likely due to some latest change in devicertl memory handling 
mechanism.
I need to debug it with this patch, otherwise it will cause all target region 
code calling device-malloc to fail.
I will try to fix it before the next revision.



Comment at: openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp:2542
+  /// Get the address of pointer to the preallocated device memory pool.
+  void **getPreAllocatedDeviceMemoryPool() {
+return &PreAllocatedDeviceMemoryPool;

jhuber6 wrote:
> Why do we need this? The current method shouldn't need to change if all we're 
> doing is allocating memory of greater size.
`PreAllocatedDeviceMemoryPool` is the pointer which stores the intermediate 
value before it is written to heap_v1_ptr field of cov5 implicitkernarg.



Comment at: openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp:3036
 
+  if (getImplicitArgsSize() < utils::COV5_SIZE) {
+DP("Setting fields of ImplicitArgs for COV4\n");

jhuber6 wrote:
> So we're required to emit some new arguments? I don't have any idea 
> what's changed between this COV4 and COV5 stuff.
In cov5, we need to set certain fields of the implicit kernel arguments before 
launching the kernel.
Please see [[ 
https://llvm.org/docs/AMDGPUUsage.html#amdgpu-amdhsa-code-object-kernel-argument-metadata-map-table-v5
 | AMDHSA Code Object V5 Kernel Argument Metadata Map Additions and Changes]] 
for more details.

Only NumBlocks, NumThreads(XYZ), GridDims, and Heap_V1_ptr are relevant for us, 
so I have simplified code further.



Comment at: openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp:3037
+  if (getImplicitArgsSize() < utils::COV5_SIZE) {
+DP("Setting fields of ImplicitArgs for COV4\n");
+  } else {

arsenm wrote:
> This isn't doing anything?
Earlier we used to set hostcall_buffer here, but not anymore.
I have left the message in DP just for debug help.



Comment at: openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h:36
 
-// The implicit arguments of AMDGPU kernels.
-struct AMDGPUImplicitArgsTy {
-  uint64_t OffsetX;
-  uint64_t OffsetY;
-  uint64_t OffsetZ;
-  uint64_t HostcallPtr;
-  uint64_t Unused0;
-  uint64_t Unused1;
-  uint64_t U

[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-08-07 Thread Saiyedul Islam via Phabricator via cfe-commits
saiislam updated this revision to Diff 547751.
saiislam marked 5 inline comments as done.
saiislam added a comment.

Removed unused cov5 implicitargs fields.
Added comments about EmitAMDGPUWorkGroupSize and ABI-agnostic code emission.
Addressed reviewers' comments.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

Files:
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/lib/CodeGen/CodeGenModule.h
  clang/lib/CodeGen/TargetInfo.h
  clang/lib/CodeGen/Targets/AMDGPU.cpp
  clang/lib/Driver/ToolChain.cpp
  openmp/libomptarget/DeviceRTL/CMakeLists.txt
  openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
  openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h

Index: openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
===
--- openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
+++ openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
@@ -25,6 +25,7 @@
 #include "llvm/Support/MemoryBufferRef.h"
 
 #include "llvm/Support/YAMLTraits.h"
+using namespace llvm::ELF;
 
 namespace llvm {
 namespace omp {
@@ -32,19 +33,35 @@
 namespace plugin {
 namespace utils {
 
-// The implicit arguments of AMDGPU kernels.
-struct AMDGPUImplicitArgsTy {
-  uint64_t OffsetX;
-  uint64_t OffsetY;
-  uint64_t OffsetZ;
-  uint64_t HostcallPtr;
-  uint64_t Unused0;
-  uint64_t Unused1;
-  uint64_t Unused2;
+enum IMPLICITARGS : uint32_t {
+  COV4_SIZE = 56,
+  COV5_SIZE = 256,
+
+  COV5_BLOCK_COUNT_X_OFFSET = 0,
+  COV5_BLOCK_COUNT_X_SIZE = 4,
+
+  COV5_GROUP_SIZE_X_OFFSET = 12,
+  COV5_GROUP_SIZE_X_SIZE = 2,
+
+  COV5_GROUP_SIZE_Y_OFFSET = 14,
+  COV5_GROUP_SIZE_Y_SIZE = 2,
+
+  COV5_GROUP_SIZE_Z_OFFSET = 16,
+  COV5_GROUP_SIZE_Z_SIZE = 2,
+
+  COV5_GRID_DIMS_OFFSET = 64,
+  COV5_GRID_DIMS_SIZE = 2,
+
+  COV5_HEAPV1_PTR_OFFSET = 96,
+  COV5_HEAPV1_PTR_SIZE = 8,
+
+  // 128 KB
+  PER_DEVICE_PREALLOC_SIZE = 131072
 };
 
-static_assert(sizeof(AMDGPUImplicitArgsTy) == 56,
-  "Unexpected size of implicit arguments");
+uint16_t getImplicitArgsSize(uint16_t Version) {
+  return Version < ELF::ELFABIVERSION_AMDGPU_HSA_V5 ? 56 : 256;
+}
 
 /// Parse a TargetID to get processor arch and feature map.
 /// Returns processor subarch.
@@ -295,7 +312,8 @@
 /// Reads the AMDGPU specific metadata from the ELF file and propagates the
 /// KernelInfoMap
 Error readAMDGPUMetaDataFromImage(MemoryBufferRef MemBuffer,
-  StringMap &KernelInfoMap) {
+  StringMap &KernelInfoMap,
+  uint16_t &ELFABIVersion) {
   Error Err = Error::success(); // Used later as out-parameter
 
   auto ELFOrError = object::ELF64LEFile::create(MemBuffer.getBuffer());
@@ -305,6 +323,12 @@
   const object::ELF64LEFile ELFObj = ELFOrError.get();
   ArrayRef Sections = cantFail(ELFObj.sections());
   KernelInfoReader Reader(KernelInfoMap);
+
+  // Read the code object version from ELF image header
+  auto Header = ELFObj.getHeader();
+  ELFABIVersion = (uint8_t)(Header.e_ident[ELF::EI_ABIVERSION]);
+  DP("ELFABIVERSION Version: %u\n", ELFABIVersion);
+
   for (const auto &S : Sections) {
 if (S.sh_type != ELF::SHT_NOTE)
   continue;
Index: openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
===
--- openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -253,6 +253,13 @@
 return Plugin::check(Status, "Error in hsa_amd_agents_allow_access: %s");
   }
 
+  Error zeroInitializeMemory(void *Ptr, size_t Size) {
+uint64_t Rounded = sizeof(uint32_t) * ((Size + 3) / sizeof(uint32_t));
+hsa_status_t Status =
+hsa_amd_memory_fill(Ptr, 0, Rounded / sizeof(uint32_t));
+return Plugin::check(Status, "Error in hsa_amd_memory_fill: %s");
+  }
+
   /// Get attribute from the memory pool.
   template 
   Error getAttr(hsa_amd_memory_pool_info_t Kind, Ty &Value) const {
@@ -381,6 +388,9 @@
   /// Get the executable.
   hsa_executable_t getExecutable() const { return Executable; }
 
+  /// Get to Code Object Version of the ELF
+  uint16_t getELFABIVersion() const { return ELFABIVersion; }
+
   /// Find an HSA device symbol by its name on the executable.
   Expected
   findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const;
@@ -401,6 +411,7 @@
   hsa_executable_t Executable;
   hsa_code_object_t CodeObject;
   StringMap KernelInfoMap;
+  uint16_t ELFABIVersion;
 };
 
 /// Class implementing the AMDGPU kernel functionalities which derives from the
@@ -408,8 +419,7 @@
 struct AMDGPUKernelTy : public GenericKernelTy {
   /// Create an AMDGPU kernel with a name and an execution mode.
   AMDGPUKernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode)
-  : GenericKernelTy(Name, ExecutionMode),
-  

[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-08-04 Thread Matt Arsenault via Phabricator via cfe-commits
arsenm added a comment.

In D139730#4561619 , @arsenm wrote:

> In D139730#4561575 , @jhuber6 wrote:
>
>> In D139730#4561573 , @arsenm wrote:
>>
>>> In D139730#4561540 , @jhuber6 
>>> wrote:
>>>
 Could you explain briefly what the approach here is? I'm confused as to 
 what's actually changed and how we're handling this difference. I thought 
 if this was just the definition of some builtin function we could just 
 rely on the backend to figure it out. Why do we need to know the code 
 object version inside the device RTL?
>>>
>>> The build is called in the device rtl, so the device RTL needs to contain 
>>> both implementations. The "backend figuring it out" is dead code elimination
>>
>> Okay, do we expect to re-use this interface anywhere? If it's just for 
>> OpenMP then we should probably copy the approach taken for 
>> `__omp_rtl_debug_kind`, which is a global created on the GPU by 
>> `CGOpenMPRuntimeGPU`'s constructor and does more or less the same thing.
>
> device libs replicates the same scheme using its own copy of an equivalent 
> variable. Trying to merge those two together

Although I guess that doesn't really need the builtin changes?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-08-04 Thread Matt Arsenault via Phabricator via cfe-commits
arsenm added a comment.

In D139730#4561575 , @jhuber6 wrote:

> In D139730#4561573 , @arsenm wrote:
>
>> In D139730#4561540 , @jhuber6 
>> wrote:
>>
>>> Could you explain briefly what the approach here is? I'm confused as to 
>>> what's actually changed and how we're handling this difference. I thought 
>>> if this was just the definition of some builtin function we could just rely 
>>> on the backend to figure it out. Why do we need to know the code object 
>>> version inside the device RTL?
>>
>> The build is called in the device rtl, so the device RTL needs to contain 
>> both implementations. The "backend figuring it out" is dead code elimination
>
> Okay, do we expect to re-use this interface anywhere? If it's just for OpenMP 
> then we should probably copy the approach taken for `__omp_rtl_debug_kind`, 
> which is a global created on the GPU by `CGOpenMPRuntimeGPU`'s constructor 
> and does more or less the same thing.

device libs replicates the same scheme using its own copy of an equivalent 
variable. Trying to merge those two together


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-08-04 Thread Joseph Huber via Phabricator via cfe-commits
jhuber6 added a comment.

In D139730#4561573 , @arsenm wrote:

> In D139730#4561540 , @jhuber6 wrote:
>
>> Could you explain briefly what the approach here is? I'm confused as to 
>> what's actually changed and how we're handling this difference. I thought if 
>> this was just the definition of some builtin function we could just rely on 
>> the backend to figure it out. Why do we need to know the code object version 
>> inside the device RTL?
>
> The build is called in the device rtl, so the device RTL needs to contain 
> both implementations. The "backend figuring it out" is dead code elimination

Okay, do we expect to re-use this interface anywhere? If it's just for OpenMP 
then we should probably copy the approach taken for `__omp_rtl_debug_kind`, 
which is a global created on the GPU by `CGOpenMPRuntimeGPU`'s constructor and 
does more or less the same thing.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-08-04 Thread Matt Arsenault via Phabricator via cfe-commits
arsenm added a comment.

In D139730#4561540 , @jhuber6 wrote:

> Could you explain briefly what the approach here is? I'm confused as to 
> what's actually changed and how we're handling this difference. I thought if 
> this was just the definition of some builtin function we could just rely on 
> the backend to figure it out. Why do we need to know the code object version 
> inside the device RTL?

The build is called in the device rtl, so the device RTL needs to contain both 
implementations. The "backend figuring it out" is dead code elimination


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-08-04 Thread Joseph Huber via Phabricator via cfe-commits
jhuber6 added inline comments.



Comment at: openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h:36
 
-// The implicit arguments of AMDGPU kernels.
-struct AMDGPUImplicitArgsTy {
-  uint64_t OffsetX;
-  uint64_t OffsetY;
-  uint64_t OffsetZ;
-  uint64_t HostcallPtr;
-  uint64_t Unused0;
-  uint64_t Unused1;
-  uint64_t Unused2;
+enum IMPLICITARGS : uint32_t {
+  COV4_SIZE = 56,

arsenm wrote:
> This is getting duplicated a few places, should it move to a support header?
> 
> I don't love the existing APIs for this, I think a struct definition makes 
> more sense
The other user here is my custom loader, @JonChesterfield has talked about 
wanting a common HSA helper header for a while now.

I agree that the struct definition is much better. Being able to simply 
allocate this size and then zero fill it is much cleaner.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-08-04 Thread Joseph Huber via Phabricator via cfe-commits
jhuber6 added a comment.

Could you explain briefly what the approach here is? I'm confused as to what's 
actually changed and how we're handling this difference. I thought if this was 
just the definition of some builtin function we could just rely on the backend 
to figure it out. Why do we need to know the code object version inside the 
device RTL?




Comment at: clang/lib/CodeGen/CGBuiltin.cpp:17140
+
+  if (Cov == clang::TargetOptions::COV_None) {
+auto *ABIVersionC = CGF.CGM.GetOrCreateLLVMGlobal(

Could you explain the function of this in a comment? Are we emitting generic 
code if unspecified?



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:17187-17188
+Address(Result, CGF.Int16Ty, CharUnits::fromQuantity(2)));
+  } else {
+if (Cov == clang::TargetOptions::COV_5) {
+  // Indexing the implicit kernarg segment.

nit.



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:17194
+  // CGBuiltin.cpp ~ line 17052 ~ Value*EmitAMDGPUWorkGroupSize ~ COV:
+  // " << Cov ; llvm::errs().resetColor();
+} else {

Leftover debugging?



Comment at: clang/lib/Driver/ToolChain.cpp:1360
+if (A->getOption().matches(options::OPT_mcode_object_version_EQ))
+  DAL->append(A);
+

Shouldn't we be able to put this under the `OPT_m_group` below?



Comment at: openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp:1752
+// if (auto Err = preAllocateDeviceMemoryPool())
+//   return Err;
+

Leftover?



Comment at: openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp:2542
+  /// Get the address of pointer to the preallocated device memory pool.
+  void **getPreAllocatedDeviceMemoryPool() {
+return &PreAllocatedDeviceMemoryPool;

Why do we need this? The current method shouldn't need to change if all we're 
doing is allocating memory of greater size.



Comment at: openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp:3036
 
+  if (getImplicitArgsSize() < utils::COV5_SIZE) {
+DP("Setting fields of ImplicitArgs for COV4\n");

So we're required to emit some new arguments? I don't have any idea 
what's changed between this COV4 and COV5 stuff.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-08-04 Thread Matt Arsenault via Phabricator via cfe-commits
arsenm added inline comments.



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:17146
+Value *ABIVersion;
+if (ABIVersionC) {
+  ABIVersion = CGF.Builder.CreateAlignedLoad(CGF.Int32Ty, ABIVersionC,

this must always pass



Comment at: openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp:3037
+  if (getImplicitArgsSize() < utils::COV5_SIZE) {
+DP("Setting fields of ImplicitArgs for COV4\n");
+  } else {

This isn't doing anything?



Comment at: openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h:36
 
-// The implicit arguments of AMDGPU kernels.
-struct AMDGPUImplicitArgsTy {
-  uint64_t OffsetX;
-  uint64_t OffsetY;
-  uint64_t OffsetZ;
-  uint64_t HostcallPtr;
-  uint64_t Unused0;
-  uint64_t Unused1;
-  uint64_t Unused2;
+enum IMPLICITARGS : uint32_t {
+  COV4_SIZE = 56,

This is getting duplicated a few places, should it move to a support header?

I don't love the existing APIs for this, I think a struct definition makes more 
sense


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-08-04 Thread Saiyedul Islam via Phabricator via cfe-commits
saiislam updated this revision to Diff 547297.
saiislam added a comment.

Another attempt at cov5 support by using CodeGen for 
buitlin_amdgpu_workgroup_size.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

Files:
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/lib/CodeGen/CodeGenModule.h
  clang/lib/CodeGen/TargetInfo.h
  clang/lib/CodeGen/Targets/AMDGPU.cpp
  clang/lib/Driver/ToolChain.cpp
  openmp/libomptarget/DeviceRTL/CMakeLists.txt
  openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
  openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h

Index: openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
===
--- openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
+++ openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
@@ -25,6 +25,7 @@
 #include "llvm/Support/MemoryBufferRef.h"
 
 #include "llvm/Support/YAMLTraits.h"
+using namespace llvm::ELF;
 
 namespace llvm {
 namespace omp {
@@ -32,19 +33,55 @@
 namespace plugin {
 namespace utils {
 
-// The implicit arguments of AMDGPU kernels.
-struct AMDGPUImplicitArgsTy {
-  uint64_t OffsetX;
-  uint64_t OffsetY;
-  uint64_t OffsetZ;
-  uint64_t HostcallPtr;
-  uint64_t Unused0;
-  uint64_t Unused1;
-  uint64_t Unused2;
+enum IMPLICITARGS : uint32_t {
+  COV4_SIZE = 56,
+  COV4_HOSTCALL_PTR_OFFSET = 24,
+  HOSTCALL_PTR_SIZE = 8,
+
+  COV5_SIZE = 256,
+
+  COV5_BLOCK_COUNT_X_OFFSET = 0,
+  COV5_BLOCK_COUNT_X_SIZE = 4,
+
+  COV5_BLOCK_COUNT_Y_OFFSET = 4,
+  COV5_BLOCK_COUNT_Y_SIZE = 4,
+
+  COV5_BLOCK_COUNT_Z_OFFSET = 8,
+  COV5_BLOCK_COUNT_Z_SIZE = 4,
+
+  COV5_GROUP_SIZE_X_OFFSET = 12,
+  COV5_GROUP_SIZE_X_SIZE = 2,
+
+  COV5_GROUP_SIZE_Y_OFFSET = 14,
+  COV5_GROUP_SIZE_Y_SIZE = 2,
+
+  COV5_GROUP_SIZE_Z_OFFSET = 16,
+  COV5_GROUP_SIZE_Z_SIZE = 2,
+
+  COV5_REMAINDER_X_OFFSET = 18,
+  COV5_REMAINDER_X_SIZE = 2,
+
+  COV5_REMAINDER_Y_OFFSET = 20,
+  COV5_REMAINDER_Y_SIZE = 2,
+
+  COV5_REMAINDER_Z_OFFSET = 22,
+  COV5_REMAINDER_Z_SIZE = 2,
+
+  COV5_GRID_DIMS_OFFSET = 64,
+  COV5_GRID_DIMS_SIZE = 2,
+
+  COV5_HOSTCALL_PTR_OFFSET = 80,
+
+  COV5_HEAPV1_PTR_OFFSET = 96,
+  COV5_HEAPV1_PTR_SIZE = 8,
+
+  // 128 KB
+  PER_DEVICE_PREALLOC_SIZE = 131072
 };
 
-static_assert(sizeof(AMDGPUImplicitArgsTy) == 56,
-  "Unexpected size of implicit arguments");
+uint16_t getImplicitArgsSize(uint16_t Version) {
+  return Version < ELF::ELFABIVERSION_AMDGPU_HSA_V5 ? 56 : 256;
+}
 
 /// Parse a TargetID to get processor arch and feature map.
 /// Returns processor subarch.
@@ -295,7 +332,8 @@
 /// Reads the AMDGPU specific metadata from the ELF file and propagates the
 /// KernelInfoMap
 Error readAMDGPUMetaDataFromImage(MemoryBufferRef MemBuffer,
-  StringMap<KernelMetaDataTy> &KernelInfoMap) {
+  StringMap<KernelMetaDataTy> &KernelInfoMap,
+  uint16_t &ELFABIVersion) {
   Error Err = Error::success(); // Used later as out-parameter
 
   auto ELFOrError = object::ELF64LEFile::create(MemBuffer.getBuffer());
@@ -305,6 +343,12 @@
   const object::ELF64LEFile ELFObj = ELFOrError.get();
   ArrayRef<object::ELF64LE::Shdr> Sections = cantFail(ELFObj.sections());
   KernelInfoReader Reader(KernelInfoMap);
+
+  // Read the code object version from ELF image header
+  auto Header = ELFObj.getHeader();
+  ELFABIVersion = (uint8_t)(Header.e_ident[ELF::EI_ABIVERSION]);
+  DP("ELFABIVERSION Version: %u\n", ELFABIVersion);
+
   for (const auto &S : Sections) {
 if (S.sh_type != ELF::SHT_NOTE)
   continue;
Index: openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
===
--- openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -253,6 +253,13 @@
 return Plugin::check(Status, "Error in hsa_amd_agents_allow_access: %s");
   }
 
+  Error zeroInitializeMemory(void *Ptr, size_t Size) {
+uint64_t Rounded = sizeof(uint32_t) * ((Size + 3) / sizeof(uint32_t));
+hsa_status_t Status =
+hsa_amd_memory_fill(Ptr, 0, Rounded / sizeof(uint32_t));
+return Plugin::check(Status, "Error in hsa_amd_memory_fill: %s");
+  }
+
   /// Get attribute from the memory pool.
  template <typename Ty>
   Error getAttr(hsa_amd_memory_pool_info_t Kind, Ty &Value) const {
@@ -381,6 +388,9 @@
   /// Get the executable.
   hsa_executable_t getExecutable() const { return Executable; }
 
+  /// Get the Code Object Version of the ELF
+  uint16_t getELFABIVersion() const { return ELFABIVersion; }
+
   /// Find an HSA device symbol by its name on the executable.
   Expected<hsa_executable_symbol_t>
   findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const;
@@ -401,6 +411,7 @@
   hsa_executable_t Executable;
   hsa_code_object_t CodeObject;
   StringMap<utils::KernelMetaDataTy> KernelInfoMap;
+  uint16_t ELFABIVersion;
 };
 
 /// Class implement

[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-06-13 Thread Joseph Huber via Phabricator via cfe-commits
jhuber6 added a comment.

In D139730#4418630 , @arsenm wrote:

> In D139730#3991628 , 
> @JonChesterfield wrote:
>
>> We currently require that library for libm, which I'm also not thrilled 
>> about, but at least you can currently build and run openmp programs (that 
>> don't use libm, like much of our tests) without it.
>
> The ABI isn't defined in terms of what device-libs does. It's fixed offsets 
> off of pointers accessible through amdgcn intrinsics. You can also just 
> directly emit the same IR, these functions aren't complicated

This is the suggestion I've talked with @saiislam about. I think we should just 
copy the magic intrinsics that are being queried here. I'm assuming we don't 
need to bother with supporting both v4 and v5 so we can just make the switch 
all at once.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-06-13 Thread Matt Arsenault via Phabricator via cfe-commits
arsenm added a comment.
Herald added subscribers: jplehr, sunshaoce.

In D139730#3991628 , @JonChesterfield 
wrote:

> We currently require that library for libm, which I'm also not thrilled 
> about, but at least you can currently build and run openmp programs (that 
> don't use libm, like much of our tests) without it.

The ABI isn't defined in terms of what device-libs does. It's fixed offsets off 
of pointers accessible through amdgcn intrinsics. You can also just directly 
emit the same IR, these functions aren't complicated


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-01-18 Thread Saiyedul Islam via Phabricator via cfe-commits
saiislam marked an inline comment as done.
saiislam added inline comments.



Comment at: clang/lib/Driver/ToolChains/Clang.cpp:7085
   if (Triple.isAMDGPU()) {
-handleAMDGPUCodeObjectVersionOptions(D, Args, CmdArgs);
+handleAMDGPUCodeObjectVersionOptions(D, C.getArgs(), CmdArgs,
+ /*IsCC1As=*/true);

yaxunl wrote:
> Any reason you need the original args? This will bypass the driver 
> translation, which should not in normal cases.
We need derived args to look for mcode-object-version. I have created a 
separate review for this change. Please have a look at D142022


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-01-04 Thread Saiyedul Islam via Phabricator via cfe-commits
saiislam added inline comments.



Comment at: openmp/libomptarget/DeviceRTL/src/Mapping.cpp:50
 
-uint32_t getNumHardwareThreadsInBlock() {
-  return __builtin_amdgcn_workgroup_size_x();
-}
+uint32_t getNumHardwareThreadsInBlock() { return external_get_local_size(0); }
 

If we still don't want to depend on rocm-device-libs then we will have to do 
something like (haven't tried this code yet):

```
uint32_t getNumHardwareThreadsInBlock() {
   if (__oclc_ABI_version < 500) {
  return __builtin_amdgcn_workgroup_size_x();
   } else {
  void *implicitArgPtr = __builtin_amdgcn_implicitarg_ptr();
  return ((ushort *)implicitArgPtr)[6];
   }
}
```



Comment at: openmp/libomptarget/DeviceRTL/src/Mapping.cpp:80
 
-uint32_t getNumberOfBlocks() {
-  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
-}
+uint32_t getNumberOfBlocks() { return external_get_num_groups(0); }
 

```
uint32_t getNumberOfBlocks() {
   if (__oclc_ABI_version < 500) {
  return __builtin_amdgcn_grid_size_x() / 
__builtin_amdgcn_workgroup_size_x();
   } else {
  void *implicitArgPtr = __builtin_amdgcn_implicitarg_ptr();
  return ((uint *)implicitArgPtr)[0];
   }
}
```


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2023-01-04 Thread Saiyedul Islam via Phabricator via cfe-commits
saiislam added a comment.

Thanks everyone for your review and comments!
I am going to address all of them in a series of smaller patches starting with 
D140784 .


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2022-12-13 Thread Shilei Tian via Phabricator via cfe-commits
tianshilei1992 added inline comments.



Comment at: openmp/libomptarget/DeviceRTL/src/Mapping.cpp:19
 #pragma omp begin declare target device_type(nohost)
-
+extern const uint16_t __oclc_ABI_version;
 #include "llvm/Frontend/OpenMP/OMPGridValues.h"

jhuber6 wrote:
> What if this isn't defined? We should be able to use the OpenMP library 
> without the AMD device libraries. Should it be extern weak?
It should be put into AMD's `declare variant`.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2022-12-13 Thread Jon Chesterfield via Phabricator via cfe-commits
JonChesterfield added a comment.

I am reluctant to add the dependency edge to rocm device libs to openmp's GPU 
runtime.

We currently require that library for libm, which I'm also not thrilled about, 
but at least you can currently build and run openmp programs (that don't use 
libm, like much of our tests) without it.

My problem with device libs is that it usually doesn't build with trunk. It 
follows a rolling dev model tied to rocm clang and when upstream does something 
that takes a long time to apply, device libs doesn't build until rocm catches 
up. I've literally never managed to compile any branch of device libs with 
trunk clang without modifying the source, generally to delete directories that 
don't look necessary for libm.

Further, selecting an ABI based on runtime code found in a library which is 
hopefully constant folded is a weird layering choice. The compiler knows what 
ABI it is emitting code for, and that's how it picks files from device libs to 
effect that choice, but it would make far more sense to me for the compiler 
back end to set this stuff up itself.

Also, if we handle ABI in the back end, then we don't get the inevitable 
problem of rocm device libs and trunk clang having totally different ideas of 
what the ABI is as they drift in and out of sync.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2022-12-09 Thread Johannes Doerfert via Phabricator via cfe-commits
jdoerfert added a comment.

Could we elaborate on the benefits, please. Now we support two versions?

Why is this helpful:

> DeviceRTL uses rocm-device-libs instead of directly calling amdgcn builtins 
> for the functions which are affected by cov5.




Comment at: openmp/libomptarget/DeviceRTL/src/State.cpp:81
+#endif
+} // extern "C"
+

Why do we need the "external..." stuff anyway?



Comment at: openmp/libomptarget/plugins/amdgpu/src/rtl.cpp:2233
+  }
+}
+

What is this all about?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2022-12-09 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl added inline comments.



Comment at: clang/lib/Driver/ToolChains/Clang.cpp:7085
   if (Triple.isAMDGPU()) {
-handleAMDGPUCodeObjectVersionOptions(D, Args, CmdArgs);
+handleAMDGPUCodeObjectVersionOptions(D, C.getArgs(), CmdArgs,
+ /*IsCC1As=*/true);

Any reason you need the original args? This will bypass the driver translation, 
which should not in normal cases.



Comment at: clang/lib/Driver/ToolChains/Clang.cpp:7086
+handleAMDGPUCodeObjectVersionOptions(D, C.getArgs(), CmdArgs,
+ /*IsCC1As=*/true);
 

clang -cc1 needs this to be default value false to emit code object version 
module flag


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2022-12-09 Thread Joseph Huber via Phabricator via cfe-commits
jhuber6 added a comment.

I'm not fully up-to-date, what's the main difference and advantage of the new 
code object version? What do all the new implicit arguments do.




Comment at: clang/lib/Driver/ToolChains/AMDGPU.cpp:953
   getAMDGPUCodeObjectVersion(getDriver(), DriverArgs));
+
   if (!RocmInstallation.checkCommonBitcodeLibs(CanonArch, LibDeviceFile,

Unrelated?



Comment at: openmp/libomptarget/DeviceRTL/include/Interface.h:169
+
+#ifdef __AMDGCN__
+size_t external_get_local_size(uint32_t dim);

This should probably use variants to match the rest of the style, also if you 
intend to read these outside of the library you'll need to put them in the 
exports file and set their visibility.



Comment at: openmp/libomptarget/DeviceRTL/src/Mapping.cpp:19
 #pragma omp begin declare target device_type(nohost)
-
+extern const uint16_t __oclc_ABI_version;
 #include "llvm/Frontend/OpenMP/OMPGridValues.h"

What if this isn't defined? We should be able to use the OpenMP library without 
the AMD device libraries. Should it be extern weak?



Comment at: openmp/libomptarget/DeviceRTL/src/State.cpp:73
+extern "C" {
+#ifdef __AMDGCN__
+size_t external_get_local_size(uint32_t dim) {

Variants



Comment at: openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h:15
 
+enum IMPLICITARGS : uint16_t {
+  COV4_SIZE = 56,

Unrelated, but is there any particular reason these aren't defined in the 
`hsa_amd_ext.h`?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2022-12-09 Thread Joseph Huber via Phabricator via cfe-commits
jhuber6 added a comment.

Maybe we should wait until D138389  lands and 
we can update both, otherwise we'd need a second patch.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D139730: [OpenMP][DeviceRTL][AMDGPU] Support code object version 5

2022-12-09 Thread Saiyedul Islam via Phabricator via cfe-commits
saiislam created this revision.
saiislam added reviewers: jdoerfert, JonChesterfield, jhuber6, yaxunl.
Herald added subscribers: kosarev, kerbowa, guansong, tpr, dstuttard, jvesely, 
kzhuravl.
Herald added a project: All.
saiislam requested review of this revision.
Herald added subscribers: openmp-commits, cfe-commits, sstefan1, MaskRay, wdng.
Herald added projects: clang, OpenMP.

Update DeviceRTL and the AMDGPU plugin to use code
object version 5. Default is code object version 4.

DeviceRTL uses rocm-device-libs instead of directly calling
amdgcn builtins for the functions which are affected by
cov5.

AMDGPU plugin queries the ELF for code object version
and then prepares various implicitargs accordingly.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D139730

Files:
  clang/lib/Driver/Driver.cpp
  clang/lib/Driver/ToolChains/AMDGPU.cpp
  clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
  clang/lib/Driver/ToolChains/AMDGPUOpenMP.h
  clang/lib/Driver/ToolChains/Clang.cpp
  openmp/libomptarget/DeviceRTL/include/Interface.h
  openmp/libomptarget/DeviceRTL/src/Mapping.cpp
  openmp/libomptarget/DeviceRTL/src/State.cpp
  openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp
  openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h
  openmp/libomptarget/plugins/amdgpu/impl/internal.h
  openmp/libomptarget/plugins/amdgpu/impl/system.cpp
  openmp/libomptarget/plugins/amdgpu/src/rtl.cpp

Index: openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
===
--- openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -124,9 +124,10 @@
   uint32_t KernargSegmentSize;
   void *KernargRegion = nullptr;
+  std::queue<int> FreeKernargSegments;
+  uint16_t CodeObjectVersion;
 
   uint32_t kernargSizeIncludingImplicit() {
-return KernargSegmentSize + sizeof(impl_implicit_args_t);
+return KernargSegmentSize + implicitArgsSize(CodeObjectVersion);
   }
 
   ~KernelArgPool() {
@@ -143,8 +144,10 @@
   KernelArgPool(const KernelArgPool &) = delete;
   KernelArgPool(KernelArgPool &&) = delete;
 
-  KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool)
-  : KernargSegmentSize(KernargSegmentSize) {
+  KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool,
+uint16_t CodeObjectVersion)
+  : KernargSegmentSize(KernargSegmentSize),
+CodeObjectVersion(CodeObjectVersion) {
 
 // impl uses one pool per kernel for all gpus, with a fixed upper size
 // preserving that exact scheme here, including the queue
@@ -228,16 +231,16 @@
   KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize,
int32_t DeviceId, void *CallStackAddr, const char *Name,
uint32_t KernargSegmentSize,
-   hsa_amd_memory_pool_t &KernArgMemoryPool)
+   hsa_amd_memory_pool_t &KernArgMemoryPool, uint16_t CodeObjectVersion)
   : ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize),
 DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) {
 DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);
 
 std::string N(Name);
 if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) {
-  KernelArgPoolMap.insert(
-  std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
-KernargSegmentSize, KernArgMemoryPool))));
+  KernelArgPoolMap.insert(std::make_pair(
+  N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
+ KernargSegmentSize, KernArgMemoryPool, CodeObjectVersion))));
 }
   }
 };
@@ -474,6 +477,7 @@
   std::vector<int> WarpSize;
   std::vector<std::string> GPUName;
   std::vector<std::string> TargetID;
+  uint16_t CodeObjectVersion;
 
   // OpenMP properties
   std::vector<int> NumTeams;
@@ -487,6 +491,7 @@
 
   // Resource pools
   SignalPoolT FreeSignalPool;
+  std::vector<void *> PreallocatedDeviceHeap;
 
   bool HostcallRequired = false;
 
@@ -861,7 +866,6 @@
"Unexpected device id!");
 FuncGblEntries[DeviceId].emplace_back();
 FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
-// KernelArgPoolMap.clear();
 E.Entries.clear();
 E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
   }
@@ -1032,6 +1036,7 @@
 SymbolInfoTable.resize(NumberOfDevices);
 DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices);
 DeviceFineGrainedMemoryPools.resize(NumberOfDevices);
+PreallocatedDeviceHeap.resize(NumberOfDevices);
 
 Err = setupDevicePools(HSAAgents);
 if (Err != HSA_STATUS_SUCCESS) {
@@ -1361,6 +1366,27 @@
   return PacketId;
 }
 
+const uint16_t getCodeObjectVersionFromELF(__tgt_device_image *Image) {
+  char *ImageBegin = (char *)Image->ImageStart;
+  size_t ImageSize = (char *)Image->ImageEnd - ImageBegin;
+
+  StringRef Buffer = StringRef(ImageBegin, ImageSize);
+  auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""),
+  /*InitContent