[PATCH] D53141: [OpenMP][libomptarget] Add runtime function for pushing coalesced global records

2018-11-01 Thread Phabricator via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rOMP345867: [OpenMP][libomptarget] Add runtime function for 
pushing coalesced global records (authored by gbercea, committed by ).

Changed prior to commit:
  https://reviews.llvm.org/D53141?vs=172007&id=172184#toc

Repository:
  rOMP OpenMP

https://reviews.llvm.org/D53141

Files:
  libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
  libomptarget/deviceRTLs/nvptx/src/interface.h
  libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
  libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
  libomptarget/deviceRTLs/nvptx/src/supporti.h

Index: libomptarget/deviceRTLs/nvptx/src/interface.h
===
--- libomptarget/deviceRTLs/nvptx/src/interface.h
+++ libomptarget/deviceRTLs/nvptx/src/interface.h
@@ -478,6 +478,8 @@
 
 EXTERN void __kmpc_data_sharing_init_stack();
 EXTERN void __kmpc_data_sharing_init_stack_spmd();
+EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size,
+int16_t UseSharedMemory);
 EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory);
 EXTERN void __kmpc_data_sharing_pop_stack(void *a);
 EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs);
Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
===
--- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
+++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
@@ -40,8 +40,6 @@
 INLINE unsigned smid() {
   unsigned id;
   asm("mov.u32 %0, %%smid;" : "=r"(id));
-  ASSERT0(LT_FUSSY, nsmid() <= MAX_SM,
-  "Expected number of SMs is less than reported.");
   return id;
 }
 
Index: libomptarget/deviceRTLs/nvptx/src/supporti.h
===
--- libomptarget/deviceRTLs/nvptx/src/supporti.h
+++ libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -188,7 +188,6 @@
 {
   void *ptr = malloc(size);
   PRINT(LD_MEM, "malloc data of size %zu for %s: 0x%llx\n", size, msg, P64(ptr));
-  ASSERT(LT_SAFETY, ptr, "failed to allocate %zu bytes for %s\n", size, msg);
   return ptr;
 }
 
Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
===
--- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -123,7 +123,7 @@
 struct DataSharingStateTy {
   __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number];
   void *StackPtr[DS_Max_Warp_Number];
-  void *FramePtr[DS_Max_Warp_Number];
+  void * volatile FramePtr[DS_Max_Warp_Number];
   int32_t ActiveThreads[DS_Max_Warp_Number];
 };
 // Additional worker slot type which is initialized with the default worker slot
Index: libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
===
--- libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
+++ libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
@@ -129,7 +129,7 @@
 
   __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
   void *&StackP = DataSharingState.StackPtr[WID];
-  void *&FrameP = DataSharingState.FramePtr[WID];
+  void * volatile &FrameP = DataSharingState.FramePtr[WID];
   int32_t &ActiveT = DataSharingState.ActiveThreads[WID];
 
   DSPRINT0(DSFLAG, "Save current slot/stack values.\n");
@@ -283,7 +283,7 @@
 
   __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
   void *&StackP = DataSharingState.StackPtr[WID];
-  void *&FrameP = DataSharingState.FramePtr[WID];
+  void * volatile &FrameP = DataSharingState.FramePtr[WID];
 
   SlotP = *SavedSharedSlot;
   StackP = *SavedSharedStack;
@@ -321,7 +321,7 @@
 
   DSPRINT(DSFLAG, "Source  warp: %d\n", SourceWID);
 
-  void *P = DataSharingState.FramePtr[SourceWID];
+  void * volatile P = DataSharingState.FramePtr[SourceWID];
   DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n");
   return P;
 }
@@ -369,47 +369,31 @@
   __threadfence_block();
 }
 
-// Called at the time of the kernel initialization. This is used to initilize
-// the list of references to shared variables and to pre-allocate global storage
-// for holding the globalized variables.
-//
-// By default the globalized variables are stored in global memory. If the
-// UseSharedMemory is set to true, the runtime will attempt to use shared memory
-// as long as the size requested fits the pre-allocated size.
-EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
-int16_t UseSharedMemory) {
+INLINE void* data_sharing_push_stack_common(size_t PushSize) {
   if (isRuntimeUninitialized()) {
 ASSERT0(LT_FUSSY, isSPMDMode(),
 "Expected SPMD mode with uninitialized runtime.");
-return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(DataSize);
+return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(PushSize);
   }
 
+  // Only warp active maste

[PATCH] D53141: [OpenMP][libomptarget] Add runtime function for pushing coalesced global records

2018-11-01 Thread Alexey Bataev via Phabricator via cfe-commits
ABataev accepted this revision.
ABataev added a comment.

LG


Repository:
  rOMP OpenMP

https://reviews.llvm.org/D53141



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D53141: [OpenMP][libomptarget] Add runtime function for pushing coalesced global records

2018-10-31 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea marked an inline comment as done.
gtbercea added inline comments.



Comment at: libomptarget/deviceRTLs/nvptx/src/supporti.h:191
   PRINT(LD_MEM, "malloc data of size %zu for %s: 0x%llx\n", size, msg, 
P64(ptr));
-  ASSERT(LT_SAFETY, ptr, "failed to allocate %zu bytes for %s\n", size, msg);
   return ptr;

ABataev wrote:
> Same, do you really need to remove this?
Yes, leaving this assert in causes the whole malloc call to be optimized out
after inlining.


Repository:
  rOMP OpenMP

https://reviews.llvm.org/D53141



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D53141: [OpenMP][libomptarget] Add runtime function for pushing coalesced global records

2018-10-31 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea updated this revision to Diff 172007.
gtbercea added a comment.

  Reinstate assert.


Repository:
  rOMP OpenMP

https://reviews.llvm.org/D53141

Files:
  libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
  libomptarget/deviceRTLs/nvptx/src/interface.h
  libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
  libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
  libomptarget/deviceRTLs/nvptx/src/supporti.h

Index: libomptarget/deviceRTLs/nvptx/src/supporti.h
===
--- libomptarget/deviceRTLs/nvptx/src/supporti.h
+++ libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -188,7 +188,6 @@
 {
   void *ptr = malloc(size);
   PRINT(LD_MEM, "malloc data of size %zu for %s: 0x%llx\n", size, msg, P64(ptr));
-  ASSERT(LT_SAFETY, ptr, "failed to allocate %zu bytes for %s\n", size, msg);
   return ptr;
 }
 
Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
===
--- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -123,7 +123,7 @@
 struct DataSharingStateTy {
   __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number];
   void *StackPtr[DS_Max_Warp_Number];
-  void *FramePtr[DS_Max_Warp_Number];
+  void * volatile FramePtr[DS_Max_Warp_Number];
   int32_t ActiveThreads[DS_Max_Warp_Number];
 };
 // Additional worker slot type which is initialized with the default worker slot
Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
===
--- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
+++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
@@ -40,8 +40,6 @@
 INLINE unsigned smid() {
   unsigned id;
   asm("mov.u32 %0, %%smid;" : "=r"(id));
-  ASSERT0(LT_FUSSY, nsmid() <= MAX_SM,
-  "Expected number of SMs is less than reported.");
   return id;
 }
 
Index: libomptarget/deviceRTLs/nvptx/src/interface.h
===
--- libomptarget/deviceRTLs/nvptx/src/interface.h
+++ libomptarget/deviceRTLs/nvptx/src/interface.h
@@ -478,6 +478,8 @@
 
 EXTERN void __kmpc_data_sharing_init_stack();
 EXTERN void __kmpc_data_sharing_init_stack_spmd();
+EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size,
+int16_t UseSharedMemory);
 EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory);
 EXTERN void __kmpc_data_sharing_pop_stack(void *a);
 EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs);
Index: libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
===
--- libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
+++ libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
@@ -129,7 +129,7 @@
 
   __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
   void *&StackP = DataSharingState.StackPtr[WID];
-  void *&FrameP = DataSharingState.FramePtr[WID];
+  void * volatile &FrameP = DataSharingState.FramePtr[WID];
   int32_t &ActiveT = DataSharingState.ActiveThreads[WID];
 
   DSPRINT0(DSFLAG, "Save current slot/stack values.\n");
@@ -283,7 +283,7 @@
 
   __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
   void *&StackP = DataSharingState.StackPtr[WID];
-  void *&FrameP = DataSharingState.FramePtr[WID];
+  void * volatile &FrameP = DataSharingState.FramePtr[WID];
 
   SlotP = *SavedSharedSlot;
   StackP = *SavedSharedStack;
@@ -321,7 +321,7 @@
 
   DSPRINT(DSFLAG, "Source  warp: %d\n", SourceWID);
 
-  void *P = DataSharingState.FramePtr[SourceWID];
+  void * volatile P = DataSharingState.FramePtr[SourceWID];
   DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n");
   return P;
 }
@@ -369,47 +369,31 @@
   __threadfence_block();
 }
 
-// Called at the time of the kernel initialization. This is used to initilize
-// the list of references to shared variables and to pre-allocate global storage
-// for holding the globalized variables.
-//
-// By default the globalized variables are stored in global memory. If the
-// UseSharedMemory is set to true, the runtime will attempt to use shared memory
-// as long as the size requested fits the pre-allocated size.
-EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
-int16_t UseSharedMemory) {
+INLINE void* data_sharing_push_stack_common(size_t PushSize) {
   if (isRuntimeUninitialized()) {
 ASSERT0(LT_FUSSY, isSPMDMode(),
 "Expected SPMD mode with uninitialized runtime.");
-return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(DataSize);
+return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(PushSize);
   }
 
+  // Only warp active master threads manage the stack.
+  bool IsWarpMaster = (getThreadId() % WARPSIZE) == 0;
+
   // Add worst-case padding to DataSize so that future stack allocations are
   // correctly aligned.
   const size_t Alignme

[PATCH] D53141: [OpenMP][libomptarget] Add runtime function for pushing coalesced global records

2018-10-31 Thread Alexey Bataev via Phabricator via cfe-commits
ABataev added inline comments.



Comment at: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu:159
   omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId);
-  ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
   newTaskDescr->InitLevelOneTaskDescr(ThreadLimit,

Do you really need to remove this?



Comment at: libomptarget/deviceRTLs/nvptx/src/supporti.h:191
   PRINT(LD_MEM, "malloc data of size %zu for %s: 0x%llx\n", size, msg, 
P64(ptr));
-  ASSERT(LT_SAFETY, ptr, "failed to allocate %zu bytes for %s\n", size, msg);
   return ptr;

Same, do you really need to remove this?


Repository:
  rOMP OpenMP

https://reviews.llvm.org/D53141



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D53141: [OpenMP][libomptarget] Add runtime function for pushing coalesced global records

2018-10-31 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea updated this revision to Diff 171993.
gtbercea added a comment.

Move the `while` loop into the `else` branch.


Repository:
  rOMP OpenMP

https://reviews.llvm.org/D53141

Files:
  libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
  libomptarget/deviceRTLs/nvptx/src/interface.h
  libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
  libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
  libomptarget/deviceRTLs/nvptx/src/supporti.h

Index: libomptarget/deviceRTLs/nvptx/src/supporti.h
===
--- libomptarget/deviceRTLs/nvptx/src/supporti.h
+++ libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -188,7 +188,6 @@
 {
   void *ptr = malloc(size);
   PRINT(LD_MEM, "malloc data of size %zu for %s: 0x%llx\n", size, msg, P64(ptr));
-  ASSERT(LT_SAFETY, ptr, "failed to allocate %zu bytes for %s\n", size, msg);
   return ptr;
 }
 
Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
===
--- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -123,7 +123,7 @@
 struct DataSharingStateTy {
   __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number];
   void *StackPtr[DS_Max_Warp_Number];
-  void *FramePtr[DS_Max_Warp_Number];
+  void * volatile FramePtr[DS_Max_Warp_Number];
   int32_t ActiveThreads[DS_Max_Warp_Number];
 };
 // Additional worker slot type which is initialized with the default worker slot
Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
===
--- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
+++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
@@ -40,8 +40,6 @@
 INLINE unsigned smid() {
   unsigned id;
   asm("mov.u32 %0, %%smid;" : "=r"(id));
-  ASSERT0(LT_FUSSY, nsmid() <= MAX_SM,
-  "Expected number of SMs is less than reported.");
   return id;
 }
 
@@ -156,7 +154,6 @@
   //
   omptarget_nvptx_TaskDescr *newTaskDescr =
   omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId);
-  ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
   newTaskDescr->InitLevelOneTaskDescr(ThreadLimit,
   currTeamDescr.LevelZeroTaskDescr());
   newTaskDescr->ThreadLimit() = ThreadLimit;
Index: libomptarget/deviceRTLs/nvptx/src/interface.h
===
--- libomptarget/deviceRTLs/nvptx/src/interface.h
+++ libomptarget/deviceRTLs/nvptx/src/interface.h
@@ -478,6 +478,8 @@
 
 EXTERN void __kmpc_data_sharing_init_stack();
 EXTERN void __kmpc_data_sharing_init_stack_spmd();
+EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size,
+int16_t UseSharedMemory);
 EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory);
 EXTERN void __kmpc_data_sharing_pop_stack(void *a);
 EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs);
Index: libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
===
--- libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
+++ libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
@@ -129,7 +129,7 @@
 
   __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
   void *&StackP = DataSharingState.StackPtr[WID];
-  void *&FrameP = DataSharingState.FramePtr[WID];
+  void * volatile &FrameP = DataSharingState.FramePtr[WID];
   int32_t &ActiveT = DataSharingState.ActiveThreads[WID];
 
   DSPRINT0(DSFLAG, "Save current slot/stack values.\n");
@@ -283,7 +283,7 @@
 
   __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
   void *&StackP = DataSharingState.StackPtr[WID];
-  void *&FrameP = DataSharingState.FramePtr[WID];
+  void * volatile &FrameP = DataSharingState.FramePtr[WID];
 
   SlotP = *SavedSharedSlot;
   StackP = *SavedSharedStack;
@@ -321,7 +321,7 @@
 
   DSPRINT(DSFLAG, "Source  warp: %d\n", SourceWID);
 
-  void *P = DataSharingState.FramePtr[SourceWID];
+  void * volatile P = DataSharingState.FramePtr[SourceWID];
   DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n");
   return P;
 }
@@ -369,47 +369,31 @@
   __threadfence_block();
 }
 
-// Called at the time of the kernel initialization. This is used to initilize
-// the list of references to shared variables and to pre-allocate global storage
-// for holding the globalized variables.
-//
-// By default the globalized variables are stored in global memory. If the
-// UseSharedMemory is set to true, the runtime will attempt to use shared memory
-// as long as the size requested fits the pre-allocated size.
-EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
-int16_t UseSharedMemory) {
+INLINE void* data_sharing_push_stack_common(size_t PushSize) {
   if (isRuntimeUninitialized()) {
 ASSERT0(LT_FUSSY, isSPMDMode(),
 "Expected SPMD mode with uninitialized runtime.");
-return om

[PATCH] D53141: [OpenMP][libomptarget] Add runtime function for pushing coalesced global records

2018-10-31 Thread Alexey Bataev via Phabricator via cfe-commits
ABataev added inline comments.



Comment at: libomptarget/deviceRTLs/nvptx/src/data_sharing.cu:442
+}
+  } while (!FrameP);
 

Do you really need the loop here? It seems to me it is better to have it in the
`else` branch of the `if (IsWarpMaster)` statement.


Repository:
  rOMP OpenMP

https://reviews.llvm.org/D53141



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D53141: [OpenMP][libomptarget] Add runtime function for pushing coalesced global records

2018-10-31 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea updated this revision to Diff 171988.
gtbercea added a comment.

  Address comments.


Repository:
  rOMP OpenMP

https://reviews.llvm.org/D53141

Files:
  libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
  libomptarget/deviceRTLs/nvptx/src/interface.h
  libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
  libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
  libomptarget/deviceRTLs/nvptx/src/supporti.h

Index: libomptarget/deviceRTLs/nvptx/src/supporti.h
===
--- libomptarget/deviceRTLs/nvptx/src/supporti.h
+++ libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -188,7 +188,6 @@
 {
   void *ptr = malloc(size);
   PRINT(LD_MEM, "malloc data of size %zu for %s: 0x%llx\n", size, msg, P64(ptr));
-  ASSERT(LT_SAFETY, ptr, "failed to allocate %zu bytes for %s\n", size, msg);
   return ptr;
 }
 
Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
===
--- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -123,7 +123,7 @@
 struct DataSharingStateTy {
   __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number];
   void *StackPtr[DS_Max_Warp_Number];
-  void *FramePtr[DS_Max_Warp_Number];
+  void * volatile FramePtr[DS_Max_Warp_Number];
   int32_t ActiveThreads[DS_Max_Warp_Number];
 };
 // Additional worker slot type which is initialized with the default worker slot
Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
===
--- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
+++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
@@ -40,8 +40,6 @@
 INLINE unsigned smid() {
   unsigned id;
   asm("mov.u32 %0, %%smid;" : "=r"(id));
-  ASSERT0(LT_FUSSY, nsmid() <= MAX_SM,
-  "Expected number of SMs is less than reported.");
   return id;
 }
 
@@ -156,7 +154,6 @@
   //
   omptarget_nvptx_TaskDescr *newTaskDescr =
   omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId);
-  ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
   newTaskDescr->InitLevelOneTaskDescr(ThreadLimit,
   currTeamDescr.LevelZeroTaskDescr());
   newTaskDescr->ThreadLimit() = ThreadLimit;
Index: libomptarget/deviceRTLs/nvptx/src/interface.h
===
--- libomptarget/deviceRTLs/nvptx/src/interface.h
+++ libomptarget/deviceRTLs/nvptx/src/interface.h
@@ -478,6 +478,8 @@
 
 EXTERN void __kmpc_data_sharing_init_stack();
 EXTERN void __kmpc_data_sharing_init_stack_spmd();
+EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size,
+int16_t UseSharedMemory);
 EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory);
 EXTERN void __kmpc_data_sharing_pop_stack(void *a);
 EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs);
Index: libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
===
--- libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
+++ libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
@@ -129,7 +129,7 @@
 
   __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
   void *&StackP = DataSharingState.StackPtr[WID];
-  void *&FrameP = DataSharingState.FramePtr[WID];
+  void * volatile &FrameP = DataSharingState.FramePtr[WID];
   int32_t &ActiveT = DataSharingState.ActiveThreads[WID];
 
   DSPRINT0(DSFLAG, "Save current slot/stack values.\n");
@@ -283,7 +283,7 @@
 
   __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
   void *&StackP = DataSharingState.StackPtr[WID];
-  void *&FrameP = DataSharingState.FramePtr[WID];
+  void * volatile &FrameP = DataSharingState.FramePtr[WID];
 
   SlotP = *SavedSharedSlot;
   StackP = *SavedSharedStack;
@@ -321,7 +321,7 @@
 
   DSPRINT(DSFLAG, "Source  warp: %d\n", SourceWID);
 
-  void *P = DataSharingState.FramePtr[SourceWID];
+  void * volatile P = DataSharingState.FramePtr[SourceWID];
   DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n");
   return P;
 }
@@ -369,96 +369,107 @@
   __threadfence_block();
 }
 
-// Called at the time of the kernel initialization. This is used to initilize
-// the list of references to shared variables and to pre-allocate global storage
-// for holding the globalized variables.
-//
-// By default the globalized variables are stored in global memory. If the
-// UseSharedMemory is set to true, the runtime will attempt to use shared memory
-// as long as the size requested fits the pre-allocated size.
-EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
-int16_t UseSharedMemory) {
+INLINE void* data_sharing_push_stack_common(size_t PushSize) {
   if (isRuntimeUninitialized()) {
 ASSERT0(LT_FUSSY, isSPMDMode(),
 "Expected SPMD mode with uninitialized runtime.");
-return omptarge

[PATCH] D53141: [OpenMP][libomptarget] Add runtime function for pushing coalesced global records

2018-10-24 Thread Alexey Bataev via Phabricator via cfe-commits
ABataev added inline comments.



Comment at: libomptarget/deviceRTLs/nvptx/src/data_sharing.cu:389
   unsigned WID = getWarpId();
+  // void * volatile FramePointer = 0;
   void *&FrameP = DataSharingState.FramePtr[WID];

This must be removed



Comment at: libomptarget/deviceRTLs/nvptx/src/data_sharing.cu:438
+// point to the start of the new frame held in StackP.
+//atomicExch((unsigned long long *)&FrameP, (unsigned long 
long)StackP);
+FrameP = StackP;

Also, must be removed



Comment at: libomptarget/deviceRTLs/nvptx/src/data_sharing.cu:444
+}
+  } while (!FrameP);
 

It is a very bad idea to have something like this without atomic instructions.
Also, for writing, you need to use atomic instructions (plus, possibly, a
`volatile` data type). Otherwise, it leads to undefined behavior and problems
during optimization.


Repository:
  rOMP OpenMP

https://reviews.llvm.org/D53141



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D53141: [OpenMP][libomptarget] Add runtime function for pushing coalesced global records

2018-10-19 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea updated this revision to Diff 170238.
gtbercea added a comment.

  Refactor.


Repository:
  rOMP OpenMP

https://reviews.llvm.org/D53141

Files:
  libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
  libomptarget/deviceRTLs/nvptx/src/interface.h
  libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
  libomptarget/deviceRTLs/nvptx/src/supporti.h

Index: libomptarget/deviceRTLs/nvptx/src/supporti.h
===
--- libomptarget/deviceRTLs/nvptx/src/supporti.h
+++ libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -188,7 +188,6 @@
 {
   void *ptr = malloc(size);
   PRINT(LD_MEM, "malloc data of size %zu for %s: 0x%llx\n", size, msg, P64(ptr));
-  ASSERT(LT_SAFETY, ptr, "failed to allocate %zu bytes for %s\n", size, msg);
   return ptr;
 }
 
Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
===
--- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
+++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
@@ -40,8 +40,6 @@
 INLINE unsigned smid() {
   unsigned id;
   asm("mov.u32 %0, %%smid;" : "=r"(id));
-  ASSERT0(LT_FUSSY, nsmid() <= MAX_SM,
-  "Expected number of SMs is less than reported.");
   return id;
 }
 
@@ -156,7 +154,6 @@
   //
   omptarget_nvptx_TaskDescr *newTaskDescr =
   omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId);
-  ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
   newTaskDescr->InitLevelOneTaskDescr(ThreadLimit,
   currTeamDescr.LevelZeroTaskDescr());
   newTaskDescr->ThreadLimit() = ThreadLimit;
Index: libomptarget/deviceRTLs/nvptx/src/interface.h
===
--- libomptarget/deviceRTLs/nvptx/src/interface.h
+++ libomptarget/deviceRTLs/nvptx/src/interface.h
@@ -478,6 +478,8 @@
 
 EXTERN void __kmpc_data_sharing_init_stack();
 EXTERN void __kmpc_data_sharing_init_stack_spmd();
+EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size,
+int16_t UseSharedMemory);
 EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory);
 EXTERN void __kmpc_data_sharing_pop_stack(void *a);
 EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs);
Index: libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
===
--- libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
+++ libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
@@ -369,96 +369,109 @@
   __threadfence_block();
 }
 
-// Called at the time of the kernel initialization. This is used to initilize
-// the list of references to shared variables and to pre-allocate global storage
-// for holding the globalized variables.
-//
-// By default the globalized variables are stored in global memory. If the
-// UseSharedMemory is set to true, the runtime will attempt to use shared memory
-// as long as the size requested fits the pre-allocated size.
-EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
-int16_t UseSharedMemory) {
+INLINE void* data_sharing_push_stack_common(size_t PushSize) {
   if (isRuntimeUninitialized()) {
 ASSERT0(LT_FUSSY, isSPMDMode(),
 "Expected SPMD mode with uninitialized runtime.");
-return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(DataSize);
+return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(PushSize);
   }
 
+  // Only warp active master threads manage the stack.
+  bool IsWarpMaster = (getThreadId() % WARPSIZE) == 0;
+
   // Add worst-case padding to DataSize so that future stack allocations are
   // correctly aligned.
   const size_t Alignment = 8;
-  if (DataSize % Alignment != 0) {
-DataSize += (Alignment - DataSize % Alignment);
-  }
+  PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment;
 
   // Frame pointer must be visible to all workers in the same warp.
   unsigned WID = getWarpId();
+  // void * volatile FramePointer = 0;
   void *&FrameP = DataSharingState.FramePtr[WID];
 
-  // Only warp active master threads manage the stack.
-  if (getThreadId() % WARPSIZE == 0) {
-// SlotP will point to either the shared memory slot or an existing
-// global memory slot.
-__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
-void *&StackP = DataSharingState.StackPtr[WID];
-
-// Compute the total memory footprint of the requested data.
-// The master thread requires a stack only for itself. A worker
-// thread (which at this point is a warp master) will require
-// space for the variables of each thread in the warp,
-// i.e. one DataSize chunk per warp lane.
-// TODO: change WARPSIZE to the number of active threads in the warp.
-size_t PushSize = IsMasterThread() ? DataSize : WARPSIZE * DataSize;
+  do {
+if (IsWarpMaster) {
+  // SlotP will point to either the shared memory slot or an existing
+  // global memory slot

[PATCH] D53141: [OpenMP][libomptarget] Add runtime function for pushing coalesced global records

2018-10-19 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea updated this revision to Diff 170236.
gtbercea added a comment.
Herald added subscribers: cfe-commits, jholewinski.

  Refactor.


Repository:
  rC Clang

https://reviews.llvm.org/D53141

Files:
  lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp


Index: lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
===
--- lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -4238,16 +4238,17 @@
 Chunk = CGF.EmitScalarConversion(getNVPTXNumThreads(CGF),
 CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
 S.getIterationVariable()->getType(), S.getBeginLoc());
+return;
   }
+  CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
+  CGF, S, ScheduleKind, Chunk);
 }
 
 void CGOpenMPRuntimeNVPTX::getDefaultScheduleAndChunk(
 CodeGenFunction &CGF, const OMPLoopDirective &S,
 OpenMPScheduleClauseKind &ScheduleKind,
 llvm::Value *&Chunk) const {
-  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
-ScheduleKind = OMPC_SCHEDULE_static;
-Chunk = CGF.Builder.getIntN(CGF.getContext().getTypeSize(
-S.getIterationVariable()->getType()), 1);
-  }
+  ScheduleKind = OMPC_SCHEDULE_static;
+  Chunk = CGF.Builder.getIntN(CGF.getContext().getTypeSize(
+  S.getIterationVariable()->getType()), 1);
 }


Index: lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
===
--- lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -4238,16 +4238,17 @@
 Chunk = CGF.EmitScalarConversion(getNVPTXNumThreads(CGF),
 CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
 S.getIterationVariable()->getType(), S.getBeginLoc());
+return;
   }
+  CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
+  CGF, S, ScheduleKind, Chunk);
 }
 
 void CGOpenMPRuntimeNVPTX::getDefaultScheduleAndChunk(
 CodeGenFunction &CGF, const OMPLoopDirective &S,
 OpenMPScheduleClauseKind &ScheduleKind,
 llvm::Value *&Chunk) const {
-  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
-ScheduleKind = OMPC_SCHEDULE_static;
-Chunk = CGF.Builder.getIntN(CGF.getContext().getTypeSize(
-S.getIterationVariable()->getType()), 1);
-  }
+  ScheduleKind = OMPC_SCHEDULE_static;
+  Chunk = CGF.Builder.getIntN(CGF.getContext().getTypeSize(
+  S.getIterationVariable()->getType()), 1);
 }
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits