[PATCH] D53141: [OpenMP][libomptarget] Add runtime function for pushing coalesced global records

Phabricator via Phabricator via cfe-commits Thu, 01 Nov 2018 11:11:23 -0700

This revision was automatically updated to reflect the committed changes.
Closed by commit rOMP345867: [OpenMP][libomptarget] Add runtime function for 
pushing coalesced global records (authored by gbercea, committed by ).


Changed prior to commit:
  https://reviews.llvm.org/D53141?vs=172007&id=172184#toc

Repository:
  rOMP OpenMP

https://reviews.llvm.org/D53141

Files:
  libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
  libomptarget/deviceRTLs/nvptx/src/interface.h
  libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
  libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
  libomptarget/deviceRTLs/nvptx/src/supporti.h

Index: libomptarget/deviceRTLs/nvptx/src/interface.h
===================================================================
--- libomptarget/deviceRTLs/nvptx/src/interface.h
+++ libomptarget/deviceRTLs/nvptx/src/interface.h
@@ -478,6 +478,8 @@
 
 EXTERN void __kmpc_data_sharing_init_stack();
 EXTERN void __kmpc_data_sharing_init_stack_spmd();
+EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size,
+    int16_t UseSharedMemory);
 EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory);
 EXTERN void __kmpc_data_sharing_pop_stack(void *a);
 EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs);
Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
===================================================================
--- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
+++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
@@ -40,8 +40,6 @@
 INLINE unsigned smid() {
   unsigned id;
   asm("mov.u32 %0, %%smid;" : "=r"(id));
-  ASSERT0(LT_FUSSY, nsmid() <= MAX_SM,
-          "Expected number of SMs is less than reported.");
   return id;
 }
 
Index: libomptarget/deviceRTLs/nvptx/src/supporti.h
===================================================================
--- libomptarget/deviceRTLs/nvptx/src/supporti.h
+++ libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -188,7 +188,6 @@
 {
   void *ptr = malloc(size);
   PRINT(LD_MEM, "malloc data of size %zu for %s: 0x%llx\n", size, msg, P64(ptr));
-  ASSERT(LT_SAFETY, ptr, "failed to allocate %zu bytes for %s\n", size, msg);
   return ptr;
 }
 
Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
===================================================================
--- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -123,7 +123,7 @@
 struct DataSharingStateTy {
   __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number];
   void *StackPtr[DS_Max_Warp_Number];
-  void *FramePtr[DS_Max_Warp_Number];
+  void * volatile FramePtr[DS_Max_Warp_Number];
   int32_t ActiveThreads[DS_Max_Warp_Number];
 };
 // Additional worker slot type which is initialized with the default worker slot
Index: libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
===================================================================
--- libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
+++ libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
@@ -129,7 +129,7 @@
 
   __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
   void *&StackP = DataSharingState.StackPtr[WID];
-  void *&FrameP = DataSharingState.FramePtr[WID];
+  void * volatile &FrameP = DataSharingState.FramePtr[WID];
   int32_t &ActiveT = DataSharingState.ActiveThreads[WID];
 
   DSPRINT0(DSFLAG, "Save current slot/stack values.\n");
@@ -283,7 +283,7 @@
 
       __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
       void *&StackP = DataSharingState.StackPtr[WID];
-      void *&FrameP = DataSharingState.FramePtr[WID];
+      void * volatile &FrameP = DataSharingState.FramePtr[WID];
 
       SlotP = *SavedSharedSlot;
       StackP = *SavedSharedStack;
@@ -321,7 +321,7 @@
 
   DSPRINT(DSFLAG, "Source  warp: %d\n", SourceWID);
 
-  void *P = DataSharingState.FramePtr[SourceWID];
+  void * volatile P = DataSharingState.FramePtr[SourceWID];
   DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n");
   return P;
 }
@@ -369,47 +369,31 @@
   __threadfence_block();
 }
 
-// Called at the time of the kernel initialization. This is used to initilize
-// the list of references to shared variables and to pre-allocate global storage
-// for holding the globalized variables.
-//
-// By default the globalized variables are stored in global memory. If the
-// UseSharedMemory is set to true, the runtime will attempt to use shared memory
-// as long as the size requested fits the pre-allocated size.
-EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
-    int16_t UseSharedMemory) {
+INLINE void* data_sharing_push_stack_common(size_t PushSize) {
   if (isRuntimeUninitialized()) {
     ASSERT0(LT_FUSSY, isSPMDMode(),
             "Expected SPMD mode with uninitialized runtime.");
-    return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(DataSize);
+    return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(PushSize);
   }
 
+  // Only warp active master threads manage the stack.
+  bool IsWarpMaster = (getThreadId() % WARPSIZE) == 0;
+
   // Add worst-case padding to DataSize so that future stack allocations are
   // correctly aligned.
   const size_t Alignment = 8;
-  if (DataSize % Alignment != 0) {
-    DataSize += (Alignment - DataSize % Alignment);
-  }
+  PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment;
 
   // Frame pointer must be visible to all workers in the same warp.
   unsigned WID = getWarpId();
-  void *&FrameP = DataSharingState.FramePtr[WID];
+  void *volatile &FrameP = DataSharingState.FramePtr[WID];
 
-  // Only warp active master threads manage the stack.
-  if (getThreadId() % WARPSIZE == 0) {
+  if (IsWarpMaster) {
     // SlotP will point to either the shared memory slot or an existing
     // global memory slot.
     __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
     void *&StackP = DataSharingState.StackPtr[WID];
 
-    // Compute the total memory footprint of the requested data.
-    // The master thread requires a stack only for itself. A worker
-    // thread (which at this point is a warp master) will require
-    // space for the variables of each thread in the warp,
-    // i.e. one DataSize chunk per warp lane.
-    // TODO: change WARPSIZE to the number of active threads in the warp.
-    size_t PushSize = IsMasterThread() ? DataSize : WARPSIZE * DataSize;
-
     // Check if we have room for the data in the current slot.
     const uintptr_t StartAddress = (uintptr_t)StackP;
     const uintptr_t EndAddress = (uintptr_t)SlotP->DataEnd;
@@ -453,12 +437,39 @@
       // Reset stack pointer to the requested address.
       StackP = (void *)RequestedEndAddress;
     }
+  } else {
+    while (!FrameP);
   }
 
-  __threadfence_block();
+  return FrameP;
+}
+
+EXTERN void* __kmpc_data_sharing_coalesced_push_stack(size_t DataSize,
+    int16_t UseSharedMemory) {
+  return data_sharing_push_stack_common(DataSize);
+}
+
+// Called at the time of the kernel initialization. This is used to initilize
+// the list of references to shared variables and to pre-allocate global storage
+// for holding the globalized variables.
+//
+// By default the globalized variables are stored in global memory. If the
+// UseSharedMemory is set to true, the runtime will attempt to use shared memory
+// as long as the size requested fits the pre-allocated size.
+EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
+    int16_t UseSharedMemory) {
+  // Compute the total memory footprint of the requested data.
+  // The master thread requires a stack only for itself. A worker
+  // thread (which at this point is a warp master) will require
+  // space for the variables of each thread in the warp,
+  // i.e. one DataSize chunk per warp lane.
+  // TODO: change WARPSIZE to the number of active threads in the warp.
+  size_t PushSize = (isRuntimeUninitialized() || IsMasterThread()) ?
+      DataSize : WARPSIZE * DataSize;
 
   // Compute the start address of the frame of each thread in the warp.
-  uintptr_t FrameStartAddress = (uintptr_t)FrameP;
+  uintptr_t FrameStartAddress =
+      (uintptr_t) data_sharing_push_stack_common(PushSize);
   FrameStartAddress += (uintptr_t) (getLaneId() * DataSize);
   return (void *)FrameStartAddress;
 }
@@ -475,6 +486,8 @@
     return omptarget_nvptx_SimpleThreadPrivateContext::Deallocate(FrameStart);
   }
 
+  __threadfence_block();
+
   if (getThreadId() % WARPSIZE == 0) {
     unsigned WID = getWarpId();
 
@@ -501,8 +514,6 @@
       SlotP->Next = 0;
     }
   }
-
-  __threadfence_block();
 }
 
 // Begin a data sharing context. Maintain a list of references to shared

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D53141: [OpenMP][libomptarget] Add runtime function for pushing coalesced global records

Reply via email to