[PATCH] D38978: [OpenMP] Enable the lowering of implicitly shared variables in OpenMP GPU-offloaded target regions to the GPU shared memory

2017-11-24 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea updated this revision to Diff 124243.
gtbercea added a comment.

Add regression tests and allow for shared memory lowering to be disabled at 
function level.


Repository:
  rL LLVM

https://reviews.llvm.org/D38978

Files:
  include/llvm/CodeGen/TargetPassConfig.h
  lib/CodeGen/TargetPassConfig.cpp
  lib/Target/NVPTX/CMakeLists.txt
  lib/Target/NVPTX/NVPTX.h
  lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  lib/Target/NVPTX/NVPTXFrameLowering.cpp
  lib/Target/NVPTX/NVPTXFunctionDataSharing.cpp
  lib/Target/NVPTX/NVPTXFunctionDataSharing.h
  lib/Target/NVPTX/NVPTXInstrInfo.td
  lib/Target/NVPTX/NVPTXLowerAlloca.cpp
  lib/Target/NVPTX/NVPTXLowerSharedFrameIndicesPass.cpp
  lib/Target/NVPTX/NVPTXRegisterInfo.cpp
  lib/Target/NVPTX/NVPTXRegisterInfo.h
  lib/Target/NVPTX/NVPTXRegisterInfo.td
  lib/Target/NVPTX/NVPTXTargetMachine.cpp
  lib/Target/NVPTX/NVPTXUtilities.cpp
  lib/Target/NVPTX/NVPTXUtilities.h
  test/CodeGen/NVPTX/insert-shared-depot.ll
  test/CodeGen/NVPTX/lower-alloca-shared.ll
  test/CodeGen/NVPTX/no-shared-depot.ll
  test/CodeGen/NVPTX/nvptx-function-data-sharing.ll

Index: test/CodeGen/NVPTX/nvptx-function-data-sharing.ll
===
--- /dev/null
+++ test/CodeGen/NVPTX/nvptx-function-data-sharing.ll
@@ -0,0 +1,31 @@
+; RUN: opt < %s -S -nvptx-function-data-sharing -infer-address-spaces | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s --check-prefix PTX
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-unknown-unknown"
+
+define void @kernel() #0 {
+; LABEL: @lower_shared_alloca
+; PTX-LABEL: .visible .entry kernel(
+  %A = alloca i32
+; CHECK: addrspacecast i32* %A to i32 addrspace(3)*
+; CHECK: addrspacecast i32 addrspace(3)* %A1 to i32*
+; CHECK: store i32 0, i32 addrspace(3)* {{%.+}}
+; PTX: add.u64 {{%rd[0-9]+}}, %SPS, 0;
+; PTX: cvta.to.shared.u64 {{%rd[0-9]+}}, {{%rd[0-9]+}};
+; PTX: st.shared.u32 [{{%rd[0-9]+}}], {{%r[0-9]+}}
+  %shared_args = alloca i32**
+  call void @callee(i32*** %shared_args)
+  %1 = load i32**, i32*** %shared_args
+  %2 = getelementptr inbounds i32*, i32** %1, i64 0
+  store i32* %A, i32** %2
+  store i32 0, i32* %A
+  ret void
+}
+
+declare void @callee(i32***)
+
+attributes #0 = {"has-nvptx-shared-depot"}
+
+!nvvm.annotations = !{!0}
+!0 = !{void ()* @kernel, !"kernel", i32 1}
Index: test/CodeGen/NVPTX/no-shared-depot.ll
===
--- /dev/null
+++ test/CodeGen/NVPTX/no-shared-depot.ll
@@ -0,0 +1,40 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64
+
+; PTX32: {{.*}}kernel()
+; PTX64: {{.*}}kernel()
+
+; PTX32: .local .align 8{{.*}}.b8{{.*}}__local_depot0
+; PTX64: .local .align 8{{.*}}.b8{{.*}}__local_depot0
+
+; PTX32-NOT: .shared .align 8{{.*}}.b8{{.*}}__shared_depot0
+; PTX64-NOT: .shared .align 8{{.*}}.b8{{.*}}__shared_depot0
+
+; PTX32-NOT: .reg .b32{{.*}}%SPS;
+; PTX64-NOT: .reg .b64{{.*}}%SPS;
+
+; PTX32-NOT: .reg .b32{{.*}}%SPSH;
+; PTX64-NOT: .reg .b64{{.*}}%SPSH;
+
+; PTX32-NOT: mov.u32{{.*}}%SPSH, __shared_depot0;
+; PTX64-NOT: mov.u64{{.*}}%SPSH, __shared_depot0;
+
+; PTX32-NOT: cvta.shared.u32{{.*}}%SPS, %SPSH;
+; PTX64-NOT: cvta.shared.u64{{.*}}%SPS, %SPSH;
+
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-unknown-unknown"
+
+define void @kernel() {
+; LABEL: @linsert_shared_depot
+  %A = alloca i32, align 4
+  %shared_args = alloca i8**, align 8
+  call void @callee(i8*** %shared_args)
+  store i32 10, i32* %A
+  ret void
+}
+
+declare void @callee(i8***)
+
+!nvvm.annotations = !{!0}
+!0 = !{void ()* @kernel, !"kernel", i32 1}
Index: test/CodeGen/NVPTX/lower-alloca-shared.ll
===
--- /dev/null
+++ test/CodeGen/NVPTX/lower-alloca-shared.ll
@@ -0,0 +1,31 @@
+; RUN: opt < %s -S -nvptx-lower-alloca -infer-address-spaces | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s --check-prefix PTX
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-unknown-unknown"
+
+define void @kernel() #0 {
+; LABEL: @lower_shared_alloca
+; PTX-LABEL: .visible .entry kernel(
+  %A = alloca i32
+; CHECK: addrspacecast i32* %A to i32 addrspace(3)*
+; CHECK: addrspacecast i32 addrspace(3)* %1 to i32*
+; CHECK: store i32 0, i32 addrspace(3)* {{%.+}}
+; PTX: add.u64 {{%rd[0-9]+}}, %SPS, 0;
+; PTX: cvta.to.shared.u64 {{%rd[0-9]+}}, {{%rd[0-9]+}};
+; PTX: st.shared.u32 [{{%rd[0-9]+}}], {{%r[0-9]+}}
+  %shared_args = alloca i32**
+  call void @callee(i32*** %shared_args)
+  %1 = load i32**, i32*** %shared_args
+  %2 = getelementptr inbounds i32*, i32** 

[PATCH] D38978: [OpenMP] Enable the lowering of implicitly shared variables in OpenMP GPU-offloaded target regions to the GPU shared memory

2017-10-17 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea updated this revision to Diff 119327.
gtbercea added a comment.

Eliminate variable and function name clean-up. That has been moved into a 
separate patch: https://reviews.llvm.org/D39005


Repository:
  rL LLVM

https://reviews.llvm.org/D38978

Files:
  include/llvm/CodeGen/TargetPassConfig.h
  lib/CodeGen/TargetPassConfig.cpp
  lib/Target/NVPTX/CMakeLists.txt
  lib/Target/NVPTX/NVPTX.h
  lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  lib/Target/NVPTX/NVPTXFrameLowering.cpp
  lib/Target/NVPTX/NVPTXFunctionDataSharing.cpp
  lib/Target/NVPTX/NVPTXFunctionDataSharing.h
  lib/Target/NVPTX/NVPTXInstrInfo.td
  lib/Target/NVPTX/NVPTXLowerAlloca.cpp
  lib/Target/NVPTX/NVPTXLowerSharedFrameIndicesPass.cpp
  lib/Target/NVPTX/NVPTXRegisterInfo.cpp
  lib/Target/NVPTX/NVPTXRegisterInfo.h
  lib/Target/NVPTX/NVPTXRegisterInfo.td
  lib/Target/NVPTX/NVPTXTargetMachine.cpp
  lib/Target/NVPTX/NVPTXUtilities.cpp
  lib/Target/NVPTX/NVPTXUtilities.h

Index: lib/Target/NVPTX/NVPTXUtilities.h
===
--- lib/Target/NVPTX/NVPTXUtilities.h
+++ lib/Target/NVPTX/NVPTXUtilities.h
@@ -14,6 +14,8 @@
 #ifndef LLVM_LIB_TARGET_NVPTX_NVPTXUTILITIES_H
 #define LLVM_LIB_TARGET_NVPTX_NVPTXUTILITIES_H
 
+#include "NVPTXTargetMachine.h"
+#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -60,6 +62,8 @@
 bool getAlign(const Function &, unsigned index, unsigned &);
 bool getAlign(const CallInst &, unsigned index, unsigned &);
 
+bool ptrIsStored(Value *Ptr);
+
 }
 
 #endif
Index: lib/Target/NVPTX/NVPTXUtilities.cpp
===
--- lib/Target/NVPTX/NVPTXUtilities.cpp
+++ lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -28,6 +28,8 @@
 
 namespace llvm {
 
+#define DEBUG_TYPE "nvptx-utilities"
+
 namespace {
 typedef std::map key_val_pair_t;
 typedef std::map global_val_annot_t;
@@ -314,4 +316,50 @@
   return false;
 }
 
+/// Returns true if there are any instructions storing
+/// the address of this pointer.
+bool ptrIsStored(Value *Ptr) {
+  SmallVector PointerAliases;
+  PointerAliases.push_back(Ptr);
+
+  SmallVector Users;
+  for (const Use  : Ptr->uses())
+Users.push_back(U.getUser());
+
+  for (unsigned I = 0; I < Users.size(); ++I) {
+// Get pointer usage
+const User *FU = Users[I];
+
+// Check if Ptr or an alias to it is the destination of the store
+auto SI = dyn_cast(FU);
+if (SI) {
+  for (auto Alias: PointerAliases)
+if (SI->getValueOperand() == Alias)
+  return true;
+  continue;
+}
+
+// TODO: Can loads lead to address being taken?
+// TODO: Can GEPs lead to address being taken?
+
+// Bitcasts increase aliases of the pointer
+auto BI = dyn_cast(FU);
+if (BI) {
+  for (const Use  : BI->uses())
+Users.push_back(U.getUser());
+  PointerAliases.push_back(BI);
+  continue;
+}
+
+// TODO:
+// There may be other instructions which increase the number
+// of alias values ex. operations on the address of the alloca.
+// The whole alloca'ed memory region needs to be shared if at
+// least one of the values needs to be shared.
+  }
+
+  // Address of the pointer has been stored
+  return false;
+}
+
 } // namespace llvm
Index: lib/Target/NVPTX/NVPTXTargetMachine.cpp
===
--- lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -54,6 +54,7 @@
 void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
 void initializeNVPTXLowerArgsPass(PassRegistry &);
 void initializeNVPTXLowerAllocaPass(PassRegistry &);
+void initializeNVPTXFunctionDataSharingPass(PassRegistry &);
 
 } // end namespace llvm
 
@@ -72,6 +73,7 @@
   initializeNVPTXAssignValidGlobalNamesPass(PR);
   initializeNVPTXLowerArgsPass(PR);
   initializeNVPTXLowerAllocaPass(PR);
+  initializeNVPTXFunctionDataSharingPass(PR);
   initializeNVPTXLowerAggrCopiesPass(PR);
 }
 
@@ -148,6 +150,7 @@
   bool addInstSelector() override;
   void addPostRegAlloc() override;
   void addMachineSSAOptimization() override;
+  void addMachineSSALowering() override;
 
   FunctionPass *createTargetRegisterAllocator(bool) override;
   void addFastRegAlloc(FunctionPass *RegAllocPass) override;
@@ -248,10 +251,15 @@
   // before the address space inference passes.
   addPass(createNVPTXLowerArgsPass(()));
   if (getOptLevel() != CodeGenOpt::None) {
+// Add address space inference passes
 addAddressSpaceInferencePasses();
 if (!DisableLoadStoreVectorizer)
   addPass(createLoadStoreVectorizerPass());
 addStraightLineScalarOptimizationPasses();
+  } else {
+// Even when no optimizations are used, we need to lower certain
+// alloca instructions to the appropriate memory type for correctness.
+addPass(createNVPTXFunctionDataSharingPass(()));
   }
 

[PATCH] D38978: [OpenMP] Enable the lowering of implicitly shared variables in OpenMP GPU-offloaded target regions to the GPU shared memory

2017-10-16 Thread Artem Belevich via Phabricator via cfe-commits
tra added a comment.

Please add tests for the cases where such local->shaed conversion should and 
should not happen.
I would appreciate if you could add details on what exactly your passes are 
supposed to move to shared memory.

Considering that device-side code tends to be heavily inlined, it may be 
prudent to add an option to control the total size of shared memory we allow to 
be used for this purpose.

In case your passes are not executed (or didn't move anything to shared 
memory), is there any impact on the generated PTX. I.e. can ptxas successfully 
optimize unused shared memory away?

If the code intentionally wants to allocate something in local memory, would 
the allocation ever be moved to shared memory by your pass? If so, how would I 
prevent that?




Comment at: lib/Target/NVPTX/NVPTXAsmPrinter.cpp:1749
+O << "\t.reg .b32 \t%SHSP;\n";
+O << "\t.reg .b32 \t%SHSPL;\n";
+  }

Nit: the name should end with `S` as the L in `SPL` was for 'local' address 
space. which then gets converted to generic AS. In your case it will be in 
shared space, hence S would be more appropriate.



Comment at: lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp:68
 
+void NVPTXAssignValidGlobalNames::generateCleanName(Value ) {
+  std::string ValidName;

The name cleanup changes in this file should probably be committed by 
themselves as they have nothing to do with the rest of the patch.



Comment at: lib/Target/NVPTX/NVPTXFunctionDataSharing.cpp:9
+//===--===//
+//
+//

Please add details about what the pass is supposed to do.


Repository:
  rL LLVM

https://reviews.llvm.org/D38978



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D38978: [OpenMP] Enable the lowering of implicitly shared variables in OpenMP GPU-offloaded target regions to the GPU shared memory

2017-10-16 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea created this revision.
Herald added subscribers: mgorny, jholewinski.

This patch is part of the development effort to add support in the current 
OpenMP GPU offloading implementation for implicitly sharing variables between a 
target region executed by the team master thread and the worker threads within 
that team.

This patch is the second of three required for successfully performing the 
implicit sharing of master thread variables with the worker threads within a 
team:
-Patch https://reviews.llvm.org/D38976 extends the CLANG code generation with 
code that handles shared variables.
-Patch (coming soon) extends the functionality of libomptarget to maintain a 
list of references to shared variables.

This patch adds a shared memory stack to the prolog of the kernel function 
representing the device offloaded OpenMP target region. The new passes along 
with the changes to existing ones, ensure that any OpenMP variable which needs 
to be shared across several threads will be allocated in this new stack, in the 
shared memory of the device. This patch covers the case of sharing variables 
from the master thread to the worker threads:

  #pragma omp target
  {
 // master thread only
 int v;
 #pragma omp parallel
 {
// worker threads
// use v
 }
  }


Repository:
  rL LLVM

https://reviews.llvm.org/D38978

Files:
  include/llvm/CodeGen/TargetPassConfig.h
  lib/CodeGen/TargetPassConfig.cpp
  lib/Target/NVPTX/CMakeLists.txt
  lib/Target/NVPTX/NVPTX.h
  lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
  lib/Target/NVPTX/NVPTXFrameLowering.cpp
  lib/Target/NVPTX/NVPTXFunctionDataSharing.cpp
  lib/Target/NVPTX/NVPTXFunctionDataSharing.h
  lib/Target/NVPTX/NVPTXInstrInfo.td
  lib/Target/NVPTX/NVPTXLowerAlloca.cpp
  lib/Target/NVPTX/NVPTXLowerSharedFrameIndicesPass.cpp
  lib/Target/NVPTX/NVPTXRegisterInfo.cpp
  lib/Target/NVPTX/NVPTXRegisterInfo.h
  lib/Target/NVPTX/NVPTXRegisterInfo.td
  lib/Target/NVPTX/NVPTXTargetMachine.cpp
  lib/Target/NVPTX/NVPTXUtilities.cpp
  lib/Target/NVPTX/NVPTXUtilities.h

Index: lib/Target/NVPTX/NVPTXUtilities.h
===
--- lib/Target/NVPTX/NVPTXUtilities.h
+++ lib/Target/NVPTX/NVPTXUtilities.h
@@ -14,6 +14,8 @@
 #ifndef LLVM_LIB_TARGET_NVPTX_NVPTXUTILITIES_H
 #define LLVM_LIB_TARGET_NVPTX_NVPTXUTILITIES_H
 
+#include "NVPTXTargetMachine.h"
+#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -60,6 +62,8 @@
 bool getAlign(const Function &, unsigned index, unsigned &);
 bool getAlign(const CallInst &, unsigned index, unsigned &);
 
+bool ptrIsStored(Value *Ptr);
+
 }
 
 #endif
Index: lib/Target/NVPTX/NVPTXUtilities.cpp
===
--- lib/Target/NVPTX/NVPTXUtilities.cpp
+++ lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -28,6 +28,8 @@
 
 namespace llvm {
 
+#define DEBUG_TYPE "nvptx-utilities"
+
 namespace {
 typedef std::map key_val_pair_t;
 typedef std::map global_val_annot_t;
@@ -314,4 +316,50 @@
   return false;
 }
 
+/// Returns true if there are any instructions storing
+/// the address of this pointer.
+bool ptrIsStored(Value *Ptr) {
+  SmallVector PointerAliases;
+  PointerAliases.push_back(Ptr);
+
+  SmallVector Users;
+  for (const Use  : Ptr->uses())
+Users.push_back(U.getUser());
+
+  for (unsigned I = 0; I < Users.size(); ++I) {
+// Get pointer usage
+const User *FU = Users[I];
+
+// Check if Ptr or an alias to it is the destination of the store
+auto SI = dyn_cast(FU);
+if (SI) {
+  for (auto Alias: PointerAliases)
+if (SI->getValueOperand() == Alias)
+  return true;
+  continue;
+}
+
+// TODO: Can loads lead to address being taken?
+// TODO: Can GEPs lead to address being taken?
+
+// Bitcasts increase aliases of the pointer
+auto BI = dyn_cast(FU);
+if (BI) {
+  for (const Use  : BI->uses())
+Users.push_back(U.getUser());
+  PointerAliases.push_back(BI);
+  continue;
+}
+
+// TODO:
+// There may be other instructions which increase the number
+// of alias values ex. operations on the address of the alloca.
+// The whole alloca'ed memory region needs to be shared if at
+// least one of the values needs to be shared.
+  }
+
+  // Address of the pointer has been stored
+  return false;
+}
+
 } // namespace llvm
Index: lib/Target/NVPTX/NVPTXTargetMachine.cpp
===
--- lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -54,6 +54,7 @@
 void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
 void initializeNVPTXLowerArgsPass(PassRegistry &);
 void initializeNVPTXLowerAllocaPass(PassRegistry &);
+void initializeNVPTXFunctionDataSharingPass(PassRegistry