On Fri, Nov 13, 2015 at 04:11:50PM +0100, Jakub Jelinek wrote: > On Fri, Nov 13, 2015 at 11:18:41AM +0100, Jakub Jelinek wrote: > > For the offloading case, I actually see a problematic spot, namely that > > GOMP_PLUGIN_target_task_completion could finish too early, and get the > > task_lock before the thread that run the gomp_target_task_fn doing map_vars > > + async_run for it. Bet I need to add further ttask state kinds and deal > > with that case (so GOMP_PLUGIN_target_task_completion would just take the > > task lock and tweak ttask state if it has not been added to the queues > > yet). > > Plus I think I want to improve the case where we are not waiting, in > > gomp_create_target_task if not waiting for dependencies actually schedule > > manually the gomp_target_task_fn. > > These two have been resolved, plus target-34.c issue resolved too (the bug > was that I've been too lazy and just put target-33.c test into #pragma omp > parallel #pragma omp single, but that is invalid OpenMP, as single is a > worksharing region and #pragma omp barrier may not be encountered in such a > region. Fixed by rewriting the testcase. > > So here is a full patch that passes for me both non-offloading and > offloading, OMP_NUM_THREADS=16 (implicit on my box) as well as > OMP_NUM_THREADS=1 (explicit). I've incorporated your incremental patch. >
I have committed the following patch to the hsa branch to implement GOMP_OFFLOAD_async_run. Tests target-33.c and target-34.c pass right away. I also do not have any usleep on HSA, so I only ran target-32.c manually after replacing the usleeps with some pointless busy looping. During the testing, I have come accross quite a few places where libgomp has to treat shared memory devices like it treats host, and so I added that to the patch too. The hunk in gomp_create_target_task should have been in the previous merge from trunk but I forgot to add it then. Any feedback welcome, Martin 2015-11-23 Martin Jambor <mjam...@suse.cz> libgomp/ * plugin/plugin-hsa.c (async_run_info): New structure. (run_kernel_asynchronously): New function. (GOMP_OFFLOAD_async_run): New implementation. * target.c (GOMP_target_data_ext): Handle shared memory devices like the host. (GOMP_target_update): Likewise. (GOMP_target_update_ext): Likewise. (GOMP_target_enter_exit_data): Likewise. (omp_target_alloc): Likewise. (omp_target_free): Likewise. (omp_target_memcpy): Likewise. (omp_target_memcpy_rect): Likewise. * task.c (gomp_create_target_task): Fill in args field of ttask. --- libgomp/plugin/plugin-hsa.c | 61 ++++++++++++++++++++++++++++++++++++++++----- libgomp/target.c | 30 ++++++++++++++-------- libgomp/task.c | 1 + 3 files changed, 76 insertions(+), 16 deletions(-) diff --git a/libgomp/plugin/plugin-hsa.c b/libgomp/plugin/plugin-hsa.c index 40dbcde..72f5bdd 100644 --- a/libgomp/plugin/plugin-hsa.c +++ b/libgomp/plugin/plugin-hsa.c @@ -1127,9 +1127,9 @@ failure: return false; } -/* Part of the libgomp plugin interface. Run a kernel on a device N and pass - the it an array of pointers in VARS as a parameter. The kernel is - identified by FN_PTR which must point to a kernel_info structure. */ +/* Part of the libgomp plugin interface. Run a kernel on device N and pass it + an array of pointers in VARS as a parameter. The kernel is identified by + FN_PTR which must point to a kernel_info structure. */ void GOMP_OFFLOAD_run (int n, void *fn_ptr, void *vars, void** args) @@ -1237,13 +1237,62 @@ GOMP_OFFLOAD_run (int n, void *fn_ptr, void *vars, void** args) GOMP_PLUGIN_fatal ("Unable to unlock an HSA agent rwlock"); } +/* Information to be passed to a thread running a kernel asycnronously. */ + +struct async_run_info +{ + int device; + void *tgt_fn; + void *tgt_vars; + void **args; + void *async_data; +}; + +/* Thread routine to run a kernel asynchronously. */ + +static void * +run_kernel_asynchronously (void *thread_arg) +{ + struct async_run_info *info = (struct async_run_info *) thread_arg; + int device = info->device; + void *tgt_fn = info->tgt_fn; + void *tgt_vars = info->tgt_vars; + void **args = info->args; + void *async_data = info->async_data; + + free (info); + GOMP_OFFLOAD_run (device, tgt_fn, tgt_vars, args); + GOMP_PLUGIN_target_task_completion (async_data); + return NULL; +} + +/* Part of the libgomp plugin interface. Run a kernel like GOMP_OFFLOAD_run + does, but asynchronously and call GOMP_PLUGIN_target_task_completion when it + has finished. */ + void GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars, void **args, void *async_data) { - /* FIXME: Implement. */ - GOMP_PLUGIN_fatal ("Support for HSA does not yet implement asynchronous " - "execution. "); + pthread_t pt; + struct async_run_info *info; + HSA_DEBUG ("GOMP_OFFLOAD_async_run invoked\n") + info = GOMP_PLUGIN_malloc (sizeof (struct async_run_info)); + + info->device = device; + info->tgt_fn = tgt_fn; + info->tgt_vars = tgt_vars; + info->args = args; + info->async_data = async_data; + + int err = pthread_create (&pt, NULL, &run_kernel_asynchronously, info); + if (err != 0) + GOMP_PLUGIN_fatal ("HSA asynchronous thread creation failed: %s", + strerror (err)); + err = pthread_detach (pt); + if (err != 0) + GOMP_PLUGIN_fatal ("Failed to detach a thread to run HRA kernel " + "asynchronously: %s", strerror (err)); } /* Deinitialize all information associated with MODULE and kernels within diff --git a/libgomp/target.c b/libgomp/target.c index a771d7d..f8a9803 100644 --- a/libgomp/target.c +++ b/libgomp/target.c @@ -1527,7 +1527,8 @@ GOMP_target_data_ext (int device, size_t mapnum, void **hostaddrs, struct gomp_device_descr *devicep = resolve_device (device); if (devicep == NULL - || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)) + || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400) + || devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) return gomp_target_data_fallback (); struct target_mem_desc *tgt @@ -1557,7 +1558,8 @@ GOMP_target_update (int device, const void *unused, size_t mapnum, struct gomp_device_descr *devicep = resolve_device (device); if (devicep == NULL - || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)) + || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400) + || devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) return; gomp_update (devicep, mapnum, hostaddrs, sizes, kinds, false); @@ -1608,7 +1610,8 @@ GOMP_target_update_ext (int device, size_t mapnum, void **hostaddrs, } if (devicep == NULL - || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)) + || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400) + || devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) return; struct gomp_thread *thr = gomp_thread (); @@ -1730,7 +1733,8 @@ GOMP_target_enter_exit_data (int device, size_t mapnum, void **hostaddrs, } if (devicep == NULL - || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)) + || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400) + || devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) return; struct gomp_thread *thr = gomp_thread (); @@ -1861,7 +1865,8 @@ omp_target_alloc (size_t size, int device_num) if (devicep == NULL) return NULL; - if (!(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)) + if (!(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400) + || devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) return malloc (size); gomp_mutex_lock (&devicep->lock); @@ -1889,7 +1894,8 @@ omp_target_free (void *device_ptr, int device_num) if (devicep == NULL) return; - if (!(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)) + if (!(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400) + || devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) { free (device_ptr); return; @@ -1946,7 +1952,8 @@ omp_target_memcpy (void *dst, void *src, size_t length, size_t dst_offset, if (dst_devicep == NULL) return EINVAL; - if (!(dst_devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)) + if (!(dst_devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400) + || dst_devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) dst_devicep = NULL; } if (src_device_num != GOMP_DEVICE_HOST_FALLBACK) @@ -1958,7 +1965,8 @@ omp_target_memcpy (void *dst, void *src, size_t length, size_t dst_offset, if (src_devicep == NULL) return EINVAL; - if (!(src_devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)) + if (!(src_devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400) + || src_devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) src_devicep = NULL; } if (src_devicep == NULL && dst_devicep == NULL) @@ -2088,7 +2096,8 @@ omp_target_memcpy_rect (void *dst, void *src, size_t element_size, if (dst_devicep == NULL) return EINVAL; - if (!(dst_devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)) + if (!(dst_devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400) + || dst_devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) dst_devicep = NULL; } if (src_device_num != GOMP_DEVICE_HOST_FALLBACK) @@ -2100,7 +2109,8 @@ omp_target_memcpy_rect (void *dst, void *src, size_t element_size, if (src_devicep == NULL) return EINVAL; - if (!(src_devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)) + if (!(src_devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400) + || src_devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) src_devicep = NULL; } diff --git a/libgomp/task.c b/libgomp/task.c index 838ef1a..18d40cf 100644 --- a/libgomp/task.c +++ b/libgomp/task.c @@ -652,6 +652,7 @@ gomp_create_target_task (struct gomp_device_descr *devicep, ttask->devicep = devicep; ttask->fn = fn; ttask->mapnum = mapnum; + ttask->args = args; memcpy (ttask->hostaddrs, hostaddrs, mapnum * sizeof (void *)); ttask->sizes = (size_t *) &ttask->hostaddrs[mapnum]; memcpy (ttask->sizes, sizes, mapnum * sizeof (size_t)); -- 2.6.0