Hi Chung-Lin! While we're all waiting for Tom to comment on this ;-) -- here's another item I realized:
On 2019-09-10T19:41:59+0800, Chung-Lin Tang <chunglin_t...@mentor.com> wrote: > The libgomp nvptx plugin changes are also quite contained, with lots of > now unneeded [...] code deleted (since we no longer first cuAlloc a > buffer for the argument record before cuLaunchKernel) It would be nice ;-) -- but unless I'm confused, it's not that simple: we either have to reject (force host-fallback execution) or keep supporting "old-style" nvptx offloading code: new-libgomp has to continue to work with nvptx offloading code once generated by old-GCC. Possibly even a mixture of old and new nvptx offloading code, if libraries are involved, huh! I have not completely thought that through, but I suppose this could be addressed by adding a flag to the 'struct nvptx_fn' (or similar) that's synthesized by nvptx 'mkoffload'? Maybe in fact the 'enum id_map_flag' machinery that I once added for 'Un-parallelized OpenACC kernels constructs with nvptx offloading: "avoid offloading"'? (That's part of og8 commit 2d42fbf7e989e4bb76727b32ef11deb5845d5ab1 -- not present on og9, huh?!) The 'enum id_map_flag' machinery serves the purpose of transporting information from the offload compiler to libgomp, which seems to be what's needed here? (But please verify.) 
For reference, your proposed changes: > --- libgomp/plugin/plugin-nvptx.c (revision 275493) > +++ libgomp/plugin/plugin-nvptx.c (working copy) > @@ -696,16 +696,24 @@ link_ptx (CUmodule *module, const struct targ_ptx_ > > static void > nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, > - unsigned *dims, void *targ_mem_desc, > - CUdeviceptr dp, CUstream stream) > + unsigned *dims, CUstream stream) > { > struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn; > CUfunction function; > int i; > - void *kargs[1]; > struct nvptx_thread *nvthd = nvptx_thread (); > int warp_size = nvthd->ptx_dev->warp_size; > + void **kernel_args = NULL; > > + GOMP_PLUGIN_debug (0, "prepare mappings (mapnum: %u)\n", (unsigned) > mapnum); > + > + if (mapnum > 0) > + { > + kernel_args = alloca (mapnum * sizeof (void *)); > + for (int i = 0; i < mapnum; i++) > + kernel_args[i] = (devaddrs[i] ? &devaddrs[i] : &hostaddrs[i]); > + } > + > function = targ_fn->fn; > > /* Initialize the launch dimensions. 
Typically this is constant, > @@ -937,11 +945,10 @@ nvptx_exec (void (*fn), size_t mapnum, void **host > api_info); > } > > - kargs[0] = &dp; > CUDA_CALL_ASSERT (cuLaunchKernel, function, > dims[GOMP_DIM_GANG], 1, 1, > dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1, > - 0, stream, kargs, 0); > + 0, stream, kernel_args, 0); > > if (profiling_p) > { > @@ -1350,67 +1357,8 @@ GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), si > void **hostaddrs, void **devaddrs, > unsigned *dims, void *targ_mem_desc) > { > - GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__); > + nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, NULL); > > - struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread (); > - acc_prof_info *prof_info = thr->prof_info; > - acc_event_info data_event_info; > - acc_api_info *api_info = thr->api_info; > - bool profiling_p = __builtin_expect (prof_info != NULL, false); > - > - void **hp = NULL; > - CUdeviceptr dp = 0; > - > - if (mapnum > 0) > - { > - size_t s = mapnum * sizeof (void *); > - hp = alloca (s); > - for (int i = 0; i < mapnum; i++) > - hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]); > - CUDA_CALL_ASSERT (cuMemAlloc, &dp, s); > - if (profiling_p) > - goacc_profiling_acc_ev_alloc (thr, (void *) dp, s); > - } > - > - /* Copy the (device) pointers to arguments to the device (dp and hp might > in > - fact have the same value on a unified-memory system). */ > - if (mapnum > 0) > - { > - if (profiling_p) > - { > - prof_info->event_type = acc_ev_enqueue_upload_start; > - > - data_event_info.data_event.event_type = prof_info->event_type; > - data_event_info.data_event.valid_bytes > - = _ACC_DATA_EVENT_INFO_VALID_BYTES; > - data_event_info.data_event.parent_construct > - = acc_construct_parallel; > - data_event_info.data_event.implicit = 1; /* Always implicit. 
*/ > - data_event_info.data_event.tool_info = NULL; > - data_event_info.data_event.var_name = NULL; > - data_event_info.data_event.bytes = mapnum * sizeof (void *); > - data_event_info.data_event.host_ptr = hp; > - data_event_info.data_event.device_ptr = (const void *) dp; > - > - api_info->device_api = acc_device_api_cuda; > - > - GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, > - api_info); > - } > - CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp, > - mapnum * sizeof (void *)); > - if (profiling_p) > - { > - prof_info->event_type = acc_ev_enqueue_upload_end; > - data_event_info.data_event.event_type = prof_info->event_type; > - GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, > - api_info); > - } > - } > - > - nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, > - dp, NULL); > - > CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL); > const char *maybe_abort_msg = "(perhaps abort was called)"; > if (r == CUDA_ERROR_LAUNCH_FAILED) > @@ -1418,20 +1366,8 @@ GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), si > maybe_abort_msg); > else if (r != CUDA_SUCCESS) > GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r)); > - > - CUDA_CALL_ASSERT (cuMemFree, dp); > - if (profiling_p) > - goacc_profiling_acc_ev_free (thr, (void *) dp); > } > > -static void > -cuda_free_argmem (void *ptr) > -{ > - void **block = (void **) ptr; > - nvptx_free (block[0], (struct ptx_device *) block[1]); > - free (block); > -} > - > void > GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum, > void **hostaddrs, void **devaddrs, > @@ -1438,78 +1374,7 @@ GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void > unsigned *dims, void *targ_mem_desc, > struct goacc_asyncqueue *aq) > { > - GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__); > - > - struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread (); > - acc_prof_info *prof_info = thr->prof_info; > - acc_event_info data_event_info; > - acc_api_info 
*api_info = thr->api_info; > - bool profiling_p = __builtin_expect (prof_info != NULL, false); > - > - void **hp = NULL; > - CUdeviceptr dp = 0; > - void **block = NULL; > - > - if (mapnum > 0) > - { > - size_t s = mapnum * sizeof (void *); > - block = (void **) GOMP_PLUGIN_malloc (2 * sizeof (void *) + s); > - hp = block + 2; > - for (int i = 0; i < mapnum; i++) > - hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]); > - CUDA_CALL_ASSERT (cuMemAlloc, &dp, s); > - if (profiling_p) > - goacc_profiling_acc_ev_alloc (thr, (void *) dp, s); > - } > - > - /* Copy the (device) pointers to arguments to the device (dp and hp might > in > - fact have the same value on a unified-memory system). */ > - if (mapnum > 0) > - { > - if (profiling_p) > - { > - prof_info->event_type = acc_ev_enqueue_upload_start; > - > - data_event_info.data_event.event_type = prof_info->event_type; > - data_event_info.data_event.valid_bytes > - = _ACC_DATA_EVENT_INFO_VALID_BYTES; > - data_event_info.data_event.parent_construct > - = acc_construct_parallel; > - data_event_info.data_event.implicit = 1; /* Always implicit. 
*/ > - data_event_info.data_event.tool_info = NULL; > - data_event_info.data_event.var_name = NULL; > - data_event_info.data_event.bytes = mapnum * sizeof (void *); > - data_event_info.data_event.host_ptr = hp; > - data_event_info.data_event.device_ptr = (const void *) dp; > - > - api_info->device_api = acc_device_api_cuda; > - > - GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, > - api_info); > - } > - > - CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, dp, (void *) hp, > - mapnum * sizeof (void *), aq->cuda_stream); > - block[0] = (void *) dp; > - > - struct nvptx_thread *nvthd = > - (struct nvptx_thread *) GOMP_PLUGIN_acc_thread (); > - block[1] = (void *) nvthd->ptx_dev; > - > - if (profiling_p) > - { > - prof_info->event_type = acc_ev_enqueue_upload_end; > - data_event_info.data_event.event_type = prof_info->event_type; > - GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, > - api_info); > - } > - } > - > - nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, > - dp, aq->cuda_stream); > - > - if (mapnum > 0) > - GOMP_OFFLOAD_openacc_async_queue_callback (aq, cuda_free_argmem, block); > + nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, aq->cuda_stream); > } > > void * Grüße Thomas
signature.asc
Description: PGP signature