This patch implements the libgomp plugin part of the transition to
host-allocated soft stacks.  For now only a simple scheme with
allocation/deallocation per launch is implemented; a followup change is
planned to cache and reuse allocations when appropriate.

The call to cuLaunchKernel is changed to pass kernel entry function arguments
in a way that allows the driver to check for mismatch (but only when the
cumulative size of passed arguments is different).

        * plugin/plugin-nvptx.c (nvptx_stacks_size): New.
        (nvptx_stacks_alloc): New.
        (nvptx_stacks_free): New.
        (GOMP_OFFLOAD_run): Allocate soft-stacks storage from the host using
        the above new functions.  Use kernel launch interface that allows
        checking for mismatched total size of entry function arguments.
 
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index cb6a3ac..adf57b1 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -1892,6 +1892,37 @@ nvptx_adjust_launch_bounds (struct targ_fn_descriptor 
*fn,
     *teams_p = max_blocks;
 }
 
+/* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
+   target regions.  */
+
+static size_t
+nvptx_stacks_size ()
+{
+  return 128 * 1024;
+}
+
+/* Return contiguous storage for NUM stacks, each SIZE bytes.  */
+
+static void *
+nvptx_stacks_alloc (size_t size, int num)
+{
+  CUdeviceptr stacks;
+  CUresult r = cuMemAlloc (&stacks, size * num);
+  if (r != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
+  return (void *) stacks;
+}
+
+/* Release storage previously allocated by nvptx_stacks_alloc.  */
+
+static void
+nvptx_stacks_free (void *p, int num)
+{
+  CUresult r = cuMemFree ((CUdeviceptr) p);
+  if (r != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
+}
+
 void
 GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
 {
@@ -1899,7 +1930,6 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, 
void **args)
   CUresult r;
   struct ptx_device *ptx_dev = ptx_devices[ord];
   const char *maybe_abort_msg = "(perhaps abort was called)";
-  void *fn_args = &tgt_vars;
   int teams = 0, threads = 0;
 
   if (!args)
@@ -1922,10 +1952,19 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void 
*tgt_vars, void **args)
     }
   nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
 
+  size_t stack_size = nvptx_stacks_size ();
+  void *stacks = nvptx_stacks_alloc (stack_size, teams * threads);
+  void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
+  size_t fn_args_size = sizeof fn_args;
+  void *config[] = {
+    CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
+    CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
+    CU_LAUNCH_PARAM_END
+  };
   r = cuLaunchKernel (function,
                      teams, 1, 1,
                      32, threads, 1,
-                     0, ptx_dev->null_stream->stream, &fn_args, 0);
+                     0, ptx_dev->null_stream->stream, NULL, config);
   if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
 
@@ -1935,6 +1974,7 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, 
void **args)
                       maybe_abort_msg);
   else if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
+  nvptx_stacks_free (stacks, teams * threads);
 }
 
 void

Reply via email to