Hi Chung-Lin! While we're all waiting for Tom to comment on this ;-) -- here's another item I realized:
On 2019-09-10T19:41:59+0800, Chung-Lin Tang <chunglin_t...@mentor.com> wrote: > The libgomp nvptx plugin changes are also quite contained, with lots of > now unneeded [...] code deleted (since we no longer first cuAlloc a > buffer for the argument record before cuLaunchKernel) It would be nice ;-) -- but unless I'm confused, it's not that simple: we either have to reject (force host-fallback execution) or keep supporting "old-style" nvptx offloading code: new-libgomp has to continue to work with nvptx offloading code once generated by old-GCC. Possibly even a mixture of old and new nvptx offloading code, if libraries are involved, huh! I have not completely thought that through, but I suppose this could be addressed by adding a flag to the 'struct nvptx_fn' (or similar) that's synthesized by nvptx 'mkoffload'? Maybe in fact the 'enum id_map_flag' machinery that I once added for 'Un-parallelized OpenACC kernels constructs with nvptx offloading: "avoid offloading"'? (That's part of og8 commit 2d42fbf7e989e4bb76727b32ef11deb5845d5ab1 -- not present on og9, huh?!) The 'enum id_map_flag' machinery serves the purpose of transporting information from the offload compiler to libgomp, which seems to be what's needed here? (But please verify.) 
For reference, your proposed changes: > --- libgomp/plugin/plugin-nvptx.c (revision 275493) > +++ libgomp/plugin/plugin-nvptx.c (working copy) > @@ -696,16 +696,24 @@ link_ptx (CUmodule *module, const struct targ_ptx_ > > static void > nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, > - unsigned *dims, void *targ_mem_desc, > - CUdeviceptr dp, CUstream stream) > + unsigned *dims, CUstream stream) > { > struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn; > CUfunction function; > int i; > - void *kargs[1]; > struct nvptx_thread *nvthd = nvptx_thread (); > int warp_size = nvthd->ptx_dev->warp_size; > + void **kernel_args = NULL; > > + GOMP_PLUGIN_debug (0, "prepare mappings (mapnum: %u)\n", (unsigned) > mapnum); > + > + if (mapnum > 0) > + { > + kernel_args = alloca (mapnum * sizeof (void *)); > + for (int i = 0; i < mapnum; i++) > + kernel_args[i] = (devaddrs[i] ? &devaddrs[i] : &hostaddrs[i]); > + } > + > function = targ_fn->fn; > > /* Initialize the launch dimensions. 
Typically this is constant, > @@ -937,11 +945,10 @@ nvptx_exec (void (*fn), size_t mapnum, void **host > api_info); > } > > - kargs[0] = &dp; > CUDA_CALL_ASSERT (cuLaunchKernel, function, > dims[GOMP_DIM_GANG], 1, 1, > dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1, > - 0, stream, kargs, 0); > + 0, stream, kernel_args, 0); > > if (profiling_p) > { > @@ -1350,67 +1357,8 @@ GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), si > void **hostaddrs, void **devaddrs, > unsigned *dims, void *targ_mem_desc) > { > - GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__); > + nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, NULL); > > - struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread (); > - acc_prof_info *prof_info = thr->prof_info; > - acc_event_info data_event_info; > - acc_api_info *api_info = thr->api_info; > - bool profiling_p = __builtin_expect (prof_info != NULL, false); > - > - void **hp = NULL; > - CUdeviceptr dp = 0; > - > - if (mapnum > 0) > - { > - size_t s = mapnum * sizeof (void *); > - hp = alloca (s); > - for (int i = 0; i < mapnum; i++) > - hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]); > - CUDA_CALL_ASSERT (cuMemAlloc, &dp, s); > - if (profiling_p) > - goacc_profiling_acc_ev_alloc (thr, (void *) dp, s); > - } > - > - /* Copy the (device) pointers to arguments to the device (dp and hp might > in > - fact have the same value on a unified-memory system). */ > - if (mapnum > 0) > - { > - if (profiling_p) > - { > - prof_info->event_type = acc_ev_enqueue_upload_start; > - > - data_event_info.data_event.event_type = prof_info->event_type; > - data_event_info.data_event.valid_bytes > - = _ACC_DATA_EVENT_INFO_VALID_BYTES; > - data_event_info.data_event.parent_construct > - = acc_construct_parallel; > - data_event_info.data_event.implicit = 1; /* Always implicit. 
*/ > - data_event_info.data_event.tool_info = NULL; > - data_event_info.data_event.var_name = NULL; > - data_event_info.data_event.bytes = mapnum * sizeof (void *); > - data_event_info.data_event.host_ptr = hp; > - data_event_info.data_event.device_ptr = (const void *) dp; > - > - api_info->device_api = acc_device_api_cuda; > - > - GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, > - api_info); > - } > - CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp, > - mapnum * sizeof (void *)); > - if (profiling_p) > - { > - prof_info->event_type = acc_ev_enqueue_upload_end; > - data_event_info.data_event.event_type = prof_info->event_type; > - GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, > - api_info); > - } > - } > - > - nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, > - dp, NULL); > - > CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL); > const char *maybe_abort_msg = "(perhaps abort was called)"; > if (r == CUDA_ERROR_LAUNCH_FAILED) > @@ -1418,20 +1366,8 @@ GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), si > maybe_abort_msg); > else if (r != CUDA_SUCCESS) > GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r)); > - > - CUDA_CALL_ASSERT (cuMemFree, dp); > - if (profiling_p) > - goacc_profiling_acc_ev_free (thr, (void *) dp); > } > > -static void > -cuda_free_argmem (void *ptr) > -{ > - void **block = (void **) ptr; > - nvptx_free (block[0], (struct ptx_device *) block[1]); > - free (block); > -} > - > void > GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum, > void **hostaddrs, void **devaddrs, > @@ -1438,78 +1374,7 @@ GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void > unsigned *dims, void *targ_mem_desc, > struct goacc_asyncqueue *aq) > { > - GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__); > - > - struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread (); > - acc_prof_info *prof_info = thr->prof_info; > - acc_event_info data_event_info; > - acc_api_info 
*api_info = thr->api_info; > - bool profiling_p = __builtin_expect (prof_info != NULL, false); > - > - void **hp = NULL; > - CUdeviceptr dp = 0; > - void **block = NULL; > - > - if (mapnum > 0) > - { > - size_t s = mapnum * sizeof (void *); > - block = (void **) GOMP_PLUGIN_malloc (2 * sizeof (void *) + s); > - hp = block + 2; > - for (int i = 0; i < mapnum; i++) > - hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]); > - CUDA_CALL_ASSERT (cuMemAlloc, &dp, s); > - if (profiling_p) > - goacc_profiling_acc_ev_alloc (thr, (void *) dp, s); > - } > - > - /* Copy the (device) pointers to arguments to the device (dp and hp might > in > - fact have the same value on a unified-memory system). */ > - if (mapnum > 0) > - { > - if (profiling_p) > - { > - prof_info->event_type = acc_ev_enqueue_upload_start; > - > - data_event_info.data_event.event_type = prof_info->event_type; > - data_event_info.data_event.valid_bytes > - = _ACC_DATA_EVENT_INFO_VALID_BYTES; > - data_event_info.data_event.parent_construct > - = acc_construct_parallel; > - data_event_info.data_event.implicit = 1; /* Always implicit. 
*/ > - data_event_info.data_event.tool_info = NULL; > - data_event_info.data_event.var_name = NULL; > - data_event_info.data_event.bytes = mapnum * sizeof (void *); > - data_event_info.data_event.host_ptr = hp; > - data_event_info.data_event.device_ptr = (const void *) dp; > - > - api_info->device_api = acc_device_api_cuda; > - > - GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, > - api_info); > - } > - > - CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, dp, (void *) hp, > - mapnum * sizeof (void *), aq->cuda_stream); > - block[0] = (void *) dp; > - > - struct nvptx_thread *nvthd = > - (struct nvptx_thread *) GOMP_PLUGIN_acc_thread (); > - block[1] = (void *) nvthd->ptx_dev; > - > - if (profiling_p) > - { > - prof_info->event_type = acc_ev_enqueue_upload_end; > - data_event_info.data_event.event_type = prof_info->event_type; > - GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, > - api_info); > - } > - } > - > - nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, > - dp, aq->cuda_stream); > - > - if (mapnum > 0) > - GOMP_OFFLOAD_openacc_async_queue_callback (aq, cuda_free_argmem, block); > + nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, aq->cuda_stream); > } > > void * Grüße Thomas
signature.asc
Description: PGP signature