gtbercea created this revision.
gtbercea added reviewers: carlo.bertolli, ABataev, Hahnfeld, grokos, caomhin, 
hfinkel.

This patch extends the libomptarget functionality in patch 
https://reviews.llvm.org/D14254 with support for the data sharing scheme for 
supporting implicitly shared variables. The runtime therefore maintains a list 
of references to shared variables.


Repository:
  rL LLVM

https://reviews.llvm.org/D41485

Files:
  libomptarget/deviceRTLs/nvptx/src/interface.h
  libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
  libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
  libomptarget/deviceRTLs/nvptx/src/option.h
  libomptarget/deviceRTLs/nvptx/src/parallel.cu

Index: libomptarget/deviceRTLs/nvptx/src/parallel.cu
===================================================================
--- libomptarget/deviceRTLs/nvptx/src/parallel.cu
+++ libomptarget/deviceRTLs/nvptx/src/parallel.cu
@@ -210,10 +210,16 @@
 //    }
 //
 // This routine is always called by the team master..
-EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, int16_t IsOMPRuntimeInitialized) {
+EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, int16_t IsOMPRuntimeInitialized,
+                                           void ***SharedArgs, int32_t nArgs) {
   PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n");
   omptarget_nvptx_workFn = WorkFn;
 
+  if (nArgs > 0) {
+    omptarget_nvptx_sharedArgs.EnsureSize(nArgs);
+    *SharedArgs = omptarget_nvptx_sharedArgs.GetArgs();
+  }
+
   if (!IsOMPRuntimeInitialized) return;
 
   // This routine is only called by the team master.  The team master is
@@ -310,11 +316,13 @@
 // returns True if this thread is active, else False.
 //
 // Only the worker threads call this routine.
-EXTERN bool __kmpc_kernel_parallel(void **WorkFn, int16_t IsOMPRuntimeInitialized) {
+EXTERN bool __kmpc_kernel_parallel(void **WorkFn, int16_t IsOMPRuntimeInitialized,
+                                   void ***SharedArgs) {
   PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n");
 
   // Work function and arguments for L1 parallel region.
   *WorkFn   = omptarget_nvptx_workFn;
+  *SharedArgs = omptarget_nvptx_sharedArgs.GetArgs();
 
   if (!IsOMPRuntimeInitialized) return true;
 
Index: libomptarget/deviceRTLs/nvptx/src/option.h
===================================================================
--- libomptarget/deviceRTLs/nvptx/src/option.h
+++ libomptarget/deviceRTLs/nvptx/src/option.h
@@ -46,6 +46,10 @@
 // to synchronize with each other.
 #define L1_BARRIER (1)
 
+// Maximum number of preallocated arguments to an outlined parallel/simd function.
+// Anything more requires dynamic memory allocation.
+#define MAX_SHARED_ARGS 20
+
 // Maximum number of omp state objects per SM allocated statically in global memory.
 #if __CUDA_ARCH__ >= 600
 #define OMP_STATE_COUNT 32
Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
===================================================================
--- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -60,6 +60,46 @@
 #define __ACTIVEMASK() __ballot(1)
 #endif
 
+// arguments needed for L0 parallelism only.
+class omptarget_nvptx_SharedArgs {
+public:
+  // All these methods must be called by the master thread only.
+  INLINE void Init() {
+    args  = buffer;
+    nArgs = MAX_SHARED_ARGS;
+  }
+  INLINE void DeInit() {
+    // Free any memory allocated for outlined parallel function with a large
+    // number of arguments.
+    if (nArgs > MAX_SHARED_ARGS) {
+      SafeFree(args, (char *)"new extended args");
+      Init();
+    }
+  }
+  INLINE void EnsureSize(int size) {
+    if (size > nArgs) {
+      if (nArgs > MAX_SHARED_ARGS) {
+        SafeFree(args, (char *)"new extended args");
+      }
+      args = (void **) SafeMalloc(size * sizeof(void *),
+                                  (char *)"new extended args");
+      nArgs = size;
+    }
+  }
+  // Called by all threads.
+  INLINE void **GetArgs() { return args; };
+private:
+  // buffer of pre-allocated arguments.
+  void *buffer[MAX_SHARED_ARGS];
+  // pointer to arguments buffer.
+  // starts off as a pointer to 'buffer' but can be dynamically allocated.
+  void **args;
+  // starts off as MAX_SHARED_ARGS but can increase in size.
+  uint32_t nArgs;
+};
+
+extern __device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_sharedArgs;
+
 // Data sharing related quantities, need to match what is used in the compiler.
 enum DATA_SHARING_SIZES {
   // The maximum number of workers in a kernel.
Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
===================================================================
--- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
+++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
@@ -32,6 +32,7 @@
 //
 extern volatile __device__ __shared__ omptarget_nvptx_WorkFn   omptarget_nvptx_workFn;
 extern __device__ __shared__ uint32_t execution_param;
+__device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_sharedArgs;
 
 ////////////////////////////////////////////////////////////////////////////////
 // init entry points
@@ -61,6 +62,9 @@
   PRINT(LD_IO, "call to __kmpc_kernel_init with version %f\n",
         OMPTARGET_NVPTX_VERSION);
 
+  // init parallel work arguments
+  omptarget_nvptx_sharedArgs.Init();
+
   if (!RequiresOMPRuntime) {
     // If OMP runtime is not required don't initialize OMP state.
     setExecutionParameters(Generic, RuntimeUninitialized);
@@ -114,6 +118,9 @@
   }
   // Done with work.  Kill the workers.
   omptarget_nvptx_workFn = 0;
+
+  // Deinit parallel work arguments
+  omptarget_nvptx_sharedArgs.DeInit();
 }
 
 EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit,
Index: libomptarget/deviceRTLs/nvptx/src/interface.h
===================================================================
--- libomptarget/deviceRTLs/nvptx/src/interface.h
+++ libomptarget/deviceRTLs/nvptx/src/interface.h
@@ -1287,8 +1287,8 @@
                                     int16_t RequiresOMPRuntime,
                                     int16_t RequiresDataSharing);
 EXTERN void __kmpc_spmd_kernel_deinit();
-EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, int16_t IsOMPRuntimeInitialized);
-EXTERN bool __kmpc_kernel_parallel(void **WorkFn, int16_t IsOMPRuntimeInitialized);
+EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, int16_t IsOMPRuntimeInitialized, void ***SharedArgs, int32_t nArgs);
+EXTERN bool __kmpc_kernel_parallel(void **WorkFn, int16_t IsOMPRuntimeInitialized, void ***SharedArgs);
 EXTERN void __kmpc_kernel_end_parallel();
 EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask, bool *IsFinal, int32_t *LaneSource);
 EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer);
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
  • [PATCH] D41485: [Op... Gheorghe-Teodor Bercea via Phabricator via cfe-commits

Reply via email to