[PATCH, nvptx] Expand OpenACC child function arguments to use CUDA params space

Chung-Lin Tang Tue, 10 Sep 2019 04:42:59 -0700

Hi Tom,
this is a completely new implementation of an earlier optimization
that Cesar submitted:
https://gcc.gnu.org/ml/gcc-patches/2017-12/msg01202.html


The objective is to transform the original single-record-pointer argument
form (OpenMP/pthreads originated) to multiple scalar parameters, that
the CUDA runtime will place directly in the .params space for GPU kernels:

#pragma acc parallel copy(a, b) copyin(c)
{
  a += b;
  b -= c;
}

compiles to GIMPLE as:

__attribute__((oacc function (1, 1, 32), omp target entrypoint))
main._omp_fn.0 (const struct .omp_data_t.8 & restrict .omp_data_i)
{
  ...
  _3 = .omp_data_i_2(D)->a;
  _4 = *_3;
  _5 = .omp_data_i_2(D)->b;
  _6 = *_5;
  ...

this patch adds pass to transform into:

__attribute__((oacc function (1, 1, 32), omp target entrypoint))
main._omp_fn.0 (int * c, int * b, int * a)
{
  ...
  _3 = a;
  _4 = *_3;
  _5 = b;
  _6 = *_5;
  ...

Cesar's original implementation tried to do this in the middle-end,
which required lots of changes throughout the compiler, libgomp interface,
etc. and required a dependency on libffi for the CPU-host fallback child
function (since there is no longer a known, fixed single-pointer argument
interface to all child functions)

This new implementation works by modifying the GIMPLE for child functions
directly at the very start (before, actually) of RTL expansion, and thus
is placed in TARGET_EXPAND_TO_RTL_HOOK, as the core issue is we inherently
need something different generated between the host-fallback vs for the GPU.

The new nvptx_expand_to_rtl_hook modifies the function decl type and
arguments, and scans the gimple body to remove occurrences of .omp_data_i.*
Detection of OpenACC child functions is done through "omp target entrypoint"
and "oacc function" attributes. Because OpenMP target child functions
have a more elaborate wrapper generated for them, this pass only supports
OpenACC right now.

The libgomp nvptx plugin changes are also quite contained, with lots of
now unneeded profiling code deleted (since we no longer first cuAlloc a
buffer for the argument record before cuLaunchKernel)

libgomp has tested with this patch x86_64-linux (nvptx-none accelerator)
without regressions (I'm currently undergoing more gcc tests as well).
Is this okay for trunk?

Thanks,
Chung-Lin

        * config/nvptx/nvptx.c (nvptx_expand_to_rtl_hook): New function
        implementing CUDA .params space transformation.
        (TARGET_EXPAND_TO_RTL_HOOK): implement hook with
        nvptx_expand_to_rtl_hook.

        libgomp/
        * plugin/plugin-nvptx.c (nvptx_exec): Adjust arguments, add
        kernel argument setup code, adjust cuLaunchKernel calling code.
        (GOMP_OFFLOAD_openacc_exec): Adjust nvptx_exec call, delete
        profiling code.
        (GOMP_OFFLOAD_openacc_async_exec): Likewise.
        (cuda_free_argmem): Delete function.

Index: gcc/config/nvptx/nvptx.c
===================================================================
--- gcc/config/nvptx/nvptx.c    (revision 275493)
+++ gcc/config/nvptx/nvptx.c    (working copy)
@@ -68,6 +68,10 @@
 #include "attribs.h"
 #include "tree-vrp.h"
 #include "tree-ssa-operands.h"
+#include "tree-pretty-print.h"
+#include "gimple-pretty-print.h"
+#include "tree-cfg.h"
+#include "gimple-ssa.h"
 #include "tree-ssanames.h"
 #include "gimplify.h"
 #include "tree-phinodes.h"
@@ -6437,6 +6441,228 @@ nvptx_set_current_function (tree fndecl)
   oacc_bcast_partition = 0;
 }
 
+static void
+nvptx_expand_to_rtl_hook (void)
+{
+  /* For utilizing CUDA .param kernel arguments, we detect and modify
+     the gimple of offloaded child functions, here before RTL expansion,
+     starting with standard OMP form:
+      foo._omp_fn.0 (const struct .omp_data_t.8 & restrict .omp_data_i) { ... }
+   
+     and transform it into a style where the OMP data record fields are
+     "exploded" into individual scalar arguments:
+      foo._omp_fn.0 (int * a, int * b, int * c) { ... }
+
+     Note that there are implicit assumptions of how OMP lowering (and/or other
+     intervening passes) behaves contained in this transformation code;
+     if those passes change in their output, this code may possibly need
+     updating.  */
+
+  if (lookup_attribute ("omp target entrypoint",
+                       DECL_ATTRIBUTES (current_function_decl))
+      /* The rather indirect manner in which OpenMP target functions are
+        launched makes this transformation only valid for OpenACC currently.
+        TODO: e.g. write_omp_entry(), nvptx_declare_function_name(), etc.
+        needs changes for this to work with OpenMP.  */
+      && lookup_attribute ("oacc function",
+                          DECL_ATTRIBUTES (current_function_decl))
+      && VOID_TYPE_P (TREE_TYPE (DECL_RESULT (current_function_decl))))
+    {
+      tree omp_data_arg = DECL_ARGUMENTS (current_function_decl);
+      tree argtype = TREE_TYPE (omp_data_arg);
+
+      /* Ensure this function is of the form of a single reference argument
+        to the OMP data record, or a single void* argument (when no values
+        passed)  */
+      if (! (DECL_CHAIN (omp_data_arg) == NULL_TREE
+            && ((TREE_CODE (argtype) == REFERENCE_TYPE
+                 && TREE_CODE (TREE_TYPE (argtype)) == RECORD_TYPE)
+                || (TREE_CODE (argtype) == POINTER_TYPE
+                    && TREE_TYPE (argtype) == void_type_node))))
+       return;
+
+      if (dump_file)
+       {
+         fprintf (dump_file, "Detected offloaded child function %s, "
+                  "starting parameter conversion\n",
+                  print_generic_expr_to_str (current_function_decl));
+         fprintf (dump_file, "OMP data record argument: %s (tree type: %s)\n",
+                  print_generic_expr_to_str (omp_data_arg),
+                  print_generic_expr_to_str (argtype));
+         fprintf (dump_file, "Data record fields:\n");
+       }
+      
+      hash_map<tree,tree> fld_to_args;
+      tree fld, rectype = TREE_TYPE (argtype);
+      tree arglist = NULL_TREE, argtypelist = NULL_TREE;
+
+      if (TREE_CODE (rectype) == RECORD_TYPE)
+       {
+         /* For each field in the OMP data record type, create a corresponding
+            PARM_DECL, and map field -> parm using the fld_to_args hash_map.
+            Also create the tree chains for creating function type and
+            DECL_ARGUMENTS below.  */
+         for (fld = TYPE_FIELDS (rectype); fld; fld = DECL_CHAIN (fld))
+           {
+             tree narg = build_decl (DECL_SOURCE_LOCATION (fld), PARM_DECL,
+                                     DECL_NAME (fld), TREE_TYPE (fld));
+             DECL_ARTIFICIAL (narg) = 1;
+             DECL_ARG_TYPE (narg) = TREE_TYPE (fld);
+             DECL_CONTEXT (narg) = current_function_decl;
+             TREE_USED (narg) = 1;
+             TREE_READONLY (narg) = 1;
+
+             if (dump_file)
+               fprintf (dump_file, "\t%s, type: %s, offset: %s bytes + %s 
bits\n",
+                        print_generic_expr_to_str (fld),
+                        print_generic_expr_to_str (TREE_TYPE (fld)),
+                        print_generic_expr_to_str (DECL_FIELD_OFFSET (fld)),
+                        print_generic_expr_to_str (DECL_FIELD_BIT_OFFSET 
(fld)));
+             fld_to_args.put (fld, narg);
+
+             TREE_CHAIN (narg) = arglist;
+             arglist = narg;
+             argtypelist = tree_cons (NULL_TREE, TREE_TYPE (narg),
+                                      argtypelist);
+           }
+         arglist = nreverse (arglist);
+         argtypelist = nreverse (argtypelist);
+       }
+      /* This is needed to not be mistaken for a stdarg function.  */
+      argtypelist = chainon (argtypelist, void_list_node);
+
+      if (dump_file)
+       {
+         fprintf (dump_file, "Function before OMP data arg replaced:\n");
+         dump_function_to_file (current_function_decl, dump_file, dump_flags);
+       }
+
+      /* Actually modify the tree type and DECL_ARGUMENTS here.  */
+      TREE_TYPE (current_function_decl) = build_function_type (void_type_node,
+                                                              argtypelist);
+      DECL_ARGUMENTS (current_function_decl) = arglist;
+
+      /* Remove local decls which correspond to *.omp_data_i->FIELD entries, by
+        scanning and skipping those entries, creating a new local_decls list.
+        We assume a very specific MEM_REF tree expression shape.  */
+      tree decl;
+      unsigned int i;
+      vec<tree, va_gc> *new_local_decls = NULL;
+      FOR_EACH_VEC_SAFE_ELT (cfun->local_decls, i, decl)
+       {
+         if (DECL_HAS_VALUE_EXPR_P (decl))
+           {
+             tree t = DECL_VALUE_EXPR (decl);
+             if (TREE_CODE (t) == MEM_REF
+                 && TREE_CODE (TREE_OPERAND (t, 0)) == COMPONENT_REF
+                 && TREE_CODE (TREE_OPERAND (TREE_OPERAND (t, 0), 0)) == 
MEM_REF
+                 && (TREE_OPERAND (TREE_OPERAND (TREE_OPERAND (t, 0), 0), 0)
+                     == omp_data_arg))
+               continue;
+           }
+         vec_safe_push (new_local_decls, decl);
+       }
+      vec_free (cfun->local_decls);
+      cfun->local_decls = new_local_decls;
+      
+      /* Scan function body for assignments from .omp_data_i->FIELD, and using
+        the above created fld_to_args hash map, convert them to reads of
+        function arguments.  */
+      basic_block bb;
+      gimple_stmt_iterator gsi;
+      FOR_EACH_BB_FN (bb, cfun)
+       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+         {
+           tree val, *val_ptr = NULL;
+           gimple *stmt = gsi_stmt (gsi);
+           if (is_gimple_assign (stmt)
+               && gimple_assign_rhs_class (stmt) == GIMPLE_SINGLE_RHS)
+             val_ptr = gimple_assign_rhs1_ptr (stmt);
+           else if (is_gimple_debug (stmt) && gimple_debug_bind_p (stmt))
+             val_ptr = gimple_debug_bind_get_value_ptr (stmt);
+
+           if (val_ptr == NULL || (val = *val_ptr) == NULL_TREE)
+             continue;
+
+           tree new_val = NULL_TREE, fld = NULL_TREE;
+
+           if (TREE_CODE (val) == COMPONENT_REF
+               && TREE_CODE (TREE_OPERAND (val, 0)) == MEM_REF
+               && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (val, 0), 0))
+                   == SSA_NAME)
+               && (SSA_NAME_VAR (TREE_OPERAND (TREE_OPERAND (val, 0), 0))
+                   == omp_data_arg))
+             {
+               /* .omp_data->FIELD case.  */
+               fld = TREE_OPERAND (val, 1);
+               new_val = *fld_to_args.get (fld);
+             }
+           else if (TREE_CODE (val) == MEM_REF
+                    && TREE_CODE (TREE_OPERAND (val, 0)) == SSA_NAME
+                    && SSA_NAME_VAR (TREE_OPERAND (val, 0)) == omp_data_arg)
+             {
+               /* This case may happen in the final tree level optimization
+                  output, due to SLP:
+                  vect.XX = MEM <vector(1) unsigned long> [(void 
*).omp_data_i_5(D) + 8B]
+
+                  Therefore here we need a more elaborate search of the field
+                  list to reverse map to which field the offset is referring
+                  to.  */
+               unsigned HOST_WIDE_INT offset
+                 = tree_to_uhwi (TREE_OPERAND (val, 1));
+
+               for (hash_map<tree, tree>::iterator i = fld_to_args.begin ();
+                    i != fld_to_args.end (); ++i)
+                 {
+                   tree cur_fld = (*i).first;
+                   tree cur_arg = (*i).second;
+                   gcc_assert (TREE_CODE (cur_arg) == PARM_DECL);
+
+                   unsigned HOST_WIDE_INT cur_offset =
+                     (tree_to_uhwi (DECL_FIELD_OFFSET (cur_fld))
+                      + (tree_to_uhwi (DECL_FIELD_BIT_OFFSET (cur_fld))
+                         / BITS_PER_UNIT));
+
+                   if (offset == cur_offset)
+                     {
+                       new_val = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (val),
+                                         cur_arg);
+                       break;
+                     }
+                 }
+             }
+
+           /* If we found the corresponding OMP data record field, replace the
+              RHS with the new created PARM_DECL.  */
+           if (new_val != NULL_TREE)
+             {
+               if (dump_file)
+                 {
+                   fprintf (dump_file, "For gimple stmt: ");
+                   print_gimple_stmt (dump_file, stmt, 0);
+                   fprintf (dump_file, "\tReplacing OMP recv ref %s with %s\n",
+                            print_generic_expr_to_str (val),
+                            print_generic_expr_to_str (new_val));
+                 }
+               /* Write in looked up ARG as new RHS value.  */
+               *val_ptr = new_val;
+             }
+         }
+
+      /* Delete SSA_NAMEs of .omp_data_i by setting them to NULL_TREE.  */
+      tree name;
+      FOR_EACH_SSA_NAME (i, name, cfun)
+       if (SSA_NAME_VAR (name) == omp_data_arg)
+         (*SSANAMES (cfun))[SSA_NAME_VERSION (name)] = NULL_TREE;
+
+      if (dump_file)
+       {
+         fprintf (dump_file, "Function after OMP data arg replaced: ");
+         dump_function_to_file (current_function_decl, dump_file, dump_flags);
+       }
+    }
+}
+
 #undef TARGET_OPTION_OVERRIDE
 #define TARGET_OPTION_OVERRIDE nvptx_option_override
 
@@ -6576,6 +6802,9 @@ nvptx_set_current_function (tree fndecl)
 #undef TARGET_SET_CURRENT_FUNCTION
 #define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function
 
+#undef TARGET_EXPAND_TO_RTL_HOOK
+#define TARGET_EXPAND_TO_RTL_HOOK nvptx_expand_to_rtl_hook
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-nvptx.h"
Index: libgomp/plugin/plugin-nvptx.c
===================================================================
--- libgomp/plugin/plugin-nvptx.c       (revision 275493)
+++ libgomp/plugin/plugin-nvptx.c       (working copy)
@@ -696,16 +696,24 @@ link_ptx (CUmodule *module, const struct targ_ptx_
 
 static void
 nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
-           unsigned *dims, void *targ_mem_desc,
-           CUdeviceptr dp, CUstream stream)
+           unsigned *dims, CUstream stream)
 {
   struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
   CUfunction function;
   int i;
-  void *kargs[1];
   struct nvptx_thread *nvthd = nvptx_thread ();
   int warp_size = nvthd->ptx_dev->warp_size;
+  void **kernel_args = NULL;
 
+  GOMP_PLUGIN_debug (0, "prepare mappings (mapnum: %u)\n", (unsigned) mapnum);
+
+  if (mapnum > 0)
+    {
+      kernel_args = alloca (mapnum * sizeof (void *));
+      for (int i = 0; i < mapnum; i++)
+       kernel_args[i] = (devaddrs[i] ? &devaddrs[i] : &hostaddrs[i]);
+    }
+  
   function = targ_fn->fn;
 
   /* Initialize the launch dimensions.  Typically this is constant,
@@ -937,11 +945,10 @@ nvptx_exec (void (*fn), size_t mapnum, void **host
                                            api_info);
     }
 
-  kargs[0] = &dp;
   CUDA_CALL_ASSERT (cuLaunchKernel, function,
                    dims[GOMP_DIM_GANG], 1, 1,
                    dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
-                   0, stream, kargs, 0);
+                   0, stream, kernel_args, 0);
 
   if (profiling_p)
     {
@@ -1350,67 +1357,8 @@ GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), si
                           void **hostaddrs, void **devaddrs,
                           unsigned *dims, void *targ_mem_desc)
 {
-  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
+  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, NULL);
 
-  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
-  acc_prof_info *prof_info = thr->prof_info;
-  acc_event_info data_event_info;
-  acc_api_info *api_info = thr->api_info;
-  bool profiling_p = __builtin_expect (prof_info != NULL, false);
-
-  void **hp = NULL;
-  CUdeviceptr dp = 0;
-
-  if (mapnum > 0)
-    {
-      size_t s = mapnum * sizeof (void *);
-      hp = alloca (s);
-      for (int i = 0; i < mapnum; i++)
-       hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
-      CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
-      if (profiling_p)
-       goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
-    }
-
-  /* Copy the (device) pointers to arguments to the device (dp and hp might in
-     fact have the same value on a unified-memory system).  */
-  if (mapnum > 0)
-    {
-      if (profiling_p)
-       {
-         prof_info->event_type = acc_ev_enqueue_upload_start;
-
-         data_event_info.data_event.event_type = prof_info->event_type;
-         data_event_info.data_event.valid_bytes
-           = _ACC_DATA_EVENT_INFO_VALID_BYTES;
-         data_event_info.data_event.parent_construct
-           = acc_construct_parallel;
-         data_event_info.data_event.implicit = 1; /* Always implicit.  */
-         data_event_info.data_event.tool_info = NULL;
-         data_event_info.data_event.var_name = NULL;
-         data_event_info.data_event.bytes = mapnum * sizeof (void *);
-         data_event_info.data_event.host_ptr = hp;
-         data_event_info.data_event.device_ptr = (const void *) dp;
-
-         api_info->device_api = acc_device_api_cuda;
-
-         GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
-                                               api_info);
-       }
-      CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp,
-                       mapnum * sizeof (void *));
-      if (profiling_p)
-       {
-         prof_info->event_type = acc_ev_enqueue_upload_end;
-         data_event_info.data_event.event_type = prof_info->event_type;
-         GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
-                                               api_info);
-       }
-    }
-
-  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
-             dp, NULL);
-
   CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL);
   const char *maybe_abort_msg = "(perhaps abort was called)";
   if (r == CUDA_ERROR_LAUNCH_FAILED)
@@ -1418,20 +1366,8 @@ GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), si
                       maybe_abort_msg);
   else if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
-
-  CUDA_CALL_ASSERT (cuMemFree, dp);
-  if (profiling_p)
-    goacc_profiling_acc_ev_free (thr, (void *) dp);
 }
 
-static void
-cuda_free_argmem (void *ptr)
-{
-  void **block = (void **) ptr;
-  nvptx_free (block[0], (struct ptx_device *) block[1]);
-  free (block);
-}
-
 void
 GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum,
                                 void **hostaddrs, void **devaddrs,
@@ -1438,78 +1374,7 @@ GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void
                                 unsigned *dims, void *targ_mem_desc,
                                 struct goacc_asyncqueue *aq)
 {
-  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
-
-  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
-  acc_prof_info *prof_info = thr->prof_info;
-  acc_event_info data_event_info;
-  acc_api_info *api_info = thr->api_info;
-  bool profiling_p = __builtin_expect (prof_info != NULL, false);
-
-  void **hp = NULL;
-  CUdeviceptr dp = 0;
-  void **block = NULL;
-
-  if (mapnum > 0)
-    {
-      size_t s = mapnum * sizeof (void *);
-      block = (void **) GOMP_PLUGIN_malloc (2 * sizeof (void *) + s);
-      hp = block + 2;
-      for (int i = 0; i < mapnum; i++)
-       hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
-      CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
-      if (profiling_p)
-       goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
-    }
-
-  /* Copy the (device) pointers to arguments to the device (dp and hp might in
-     fact have the same value on a unified-memory system).  */
-  if (mapnum > 0)
-    {
-      if (profiling_p)
-       {
-         prof_info->event_type = acc_ev_enqueue_upload_start;
-
-         data_event_info.data_event.event_type = prof_info->event_type;
-         data_event_info.data_event.valid_bytes
-           = _ACC_DATA_EVENT_INFO_VALID_BYTES;
-         data_event_info.data_event.parent_construct
-           = acc_construct_parallel;
-         data_event_info.data_event.implicit = 1; /* Always implicit.  */
-         data_event_info.data_event.tool_info = NULL;
-         data_event_info.data_event.var_name = NULL;
-         data_event_info.data_event.bytes = mapnum * sizeof (void *);
-         data_event_info.data_event.host_ptr = hp;
-         data_event_info.data_event.device_ptr = (const void *) dp;
-
-         api_info->device_api = acc_device_api_cuda;
-
-         GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
-                                               api_info);
-       }
-
-      CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, dp, (void *) hp,
-                       mapnum * sizeof (void *), aq->cuda_stream);
-      block[0] = (void *) dp;
-
-      struct nvptx_thread *nvthd =
-       (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
-      block[1] = (void *) nvthd->ptx_dev;
-
-      if (profiling_p)
-       {
-         prof_info->event_type = acc_ev_enqueue_upload_end;
-         data_event_info.data_event.event_type = prof_info->event_type;
-         GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
-                                               api_info);
-       }
-    }
-
-  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
-             dp, aq->cuda_stream);
-
-  if (mapnum > 0)
-    GOMP_OFFLOAD_openacc_async_queue_callback (aq, cuda_free_argmem, block);
+  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, aq->cuda_stream);
 }
 
 void *

[PATCH, nvptx] Expand OpenACC child function arguments to use CUDA params space

Reply via email to