Re: libgomp: Simplify OpenMP reverse offload host <-> device memory copy implementation (was: [Patch] libgomp/nvptx: Prepare for reverse-offload callback handling)
On 28.04.23 11:31, Thomas Schwinge wrote: On 2023-04-28T10:48:31+0200, Tobias Burnus wrote: I don't think that just calling "exit (EXIT_FAILURE);" is the the proper way The point is, when we run into such an 'exit', we've already issued an error (in the plugin, via 'GOMP_PLUGIN_fatal'), you meant: GOMP_PLUGIN_error. and then (to replicate what 'GOMP_PLUGIN_fatal'/'gomp_fatal' do) we just need to 'exit' -- after unlocking. The latter is the reason why we can't just do this: – I think that should be GOMP_PLUGIN_fatal in the plugin and gomp_fatal in target.c. ..., because we'd dead-lock due to 'atexit' shutdown of devices etc., while still having devices etc. locked. (Resolving all this differently/"properly" is for another day.) → https://gcc.gnu.org/PR109664 Otherwise, it LGTM. Thanks. OK to push then, given the rationale above? OK. Tobias - Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955
Re: libgomp: Simplify OpenMP reverse offload host <-> device memory copy implementation (was: [Patch] libgomp/nvptx: Prepare for reverse-offload callback handling)
Hi Tobias! On 2023-04-28T10:48:31+0200, Tobias Burnus wrote: > On 21.03.23 16:53, Thomas Schwinge wrote: >> On 2022-08-26T11:07:28+0200, Tobias Burnus >> wrote: >>> This patch adds initial [OpenMP reverse offload] support for nvptx. >>> CUDA does lockup when trying to copy data from the currently running >>> stream; hence, a new stream is generated to do the memory copying. >> As part of other work, where I had to touch those special code paths, I >> found that we may reduce complexity a little bit "by using the existing >> 'goacc_asyncqueue' instead of re-coding parts of it". OK to push >> "libgomp: Simplify OpenMP reverse offload host <-> device memory copy >> implementation" >> (still testing), see attached? > > I don't think that just calling "exit (EXIT_FAILURE);" is the the proper > way The point is, when we run into such an 'exit', we've already issued an error (in the plugin, via 'GOMP_PLUGIN_fatal'), and then (to replicate what 'GOMP_PLUGIN_fatal'/'gomp_fatal' do) we just need to 'exit' -- after unlocking. The latter is the reason why we can't just do this: > – I think that should be GOMP_PLUGIN_fatal in the plugin and > gomp_fatal in target.c. ..., because we'd dead-lock due to 'atexit' shutdown of devices etc., while still having devices etc. locked. (Resolving all this differently/"properly" is for another day.) > Otherwise, it LGTM. Thanks. OK to push then, given the rationale above? Grüße Thomas >> Subject: [PATCH] libgomp: Simplify OpenMP reverse offload host <-> device >> memory copy implementation >> >> ... by using the existing 'goacc_asyncqueue' instead of re-coding parts of >> it. >> >> Follow-up to commit 131d18e928a3ea1ab2d3bf61aa92d68a8a254609 >> "libgomp/nvptx: Prepare for reverse-offload callback handling", >> and commit ea4b23d9c82d9be3b982c3519fe5e8e9d833a6a8 >> "libgomp: Handle OpenMP's reverse offloads". >> >> libgomp/ >> * target.c (gomp_target_rev): Instead of 'dev_to_host_cpy', >> 'host_to_dev_cpy', 'token', take a single 'goacc_asyncqueue'. >> * libgomp.h (gomp_target_rev): Adjust. >> * libgomp-plugin.c (GOMP_PLUGIN_target_rev): Adjust. >> * libgomp-plugin.h (GOMP_PLUGIN_target_rev): Adjust. >> * plugin/plugin-gcn.c (process_reverse_offload): Adjust. >> * plugin/plugin-nvptx.c (rev_off_dev_to_host_cpy) >> (rev_off_host_to_dev_cpy): Remove. >> (GOMP_OFFLOAD_run): Adjust. >> --- >> libgomp/libgomp-plugin.c | 7 +-- >> libgomp/libgomp-plugin.h | 6 +- >> libgomp/libgomp.h | 5 +- >> libgomp/plugin/plugin-gcn.c | 2 +- >> libgomp/plugin/plugin-nvptx.c | 77 ++--- >> libgomp/target.c | 102 +++--- >> 6 files changed, 96 insertions(+), 103 deletions(-) >> >> diff --git a/libgomp/libgomp-plugin.c b/libgomp/libgomp-plugin.c >> index 27e7c94ba9b..d696515eeb6 100644 >> --- a/libgomp/libgomp-plugin.c >> +++ b/libgomp/libgomp-plugin.c >> @@ -82,11 +82,8 @@ GOMP_PLUGIN_fatal (const char *msg, ...) >> void >> GOMP_PLUGIN_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t >> devaddrs_ptr, >> uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num, >> - void (*dev_to_host_cpy) (void *, const void *, size_t, >> - void *), >> - void (*host_to_dev_cpy) (void *, const void *, size_t, >> - void *), void *token) >> + struct goacc_asyncqueue *aq) >> { >> gomp_target_rev (fn_ptr, mapnum, devaddrs_ptr, sizes_ptr, kinds_ptr, >> dev_num, >> -dev_to_host_cpy, host_to_dev_cpy, token); >> +aq); >> } >> diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h >> index 28267f75f7a..42ee3d6c7f9 100644 >> --- a/libgomp/libgomp-plugin.h >> +++ b/libgomp/libgomp-plugin.h >> @@ -121,11 +121,7 @@ extern void GOMP_PLUGIN_fatal (const char *, ...) >> __attribute__ ((noreturn, format (printf, 1, 2))); >> >> extern void GOMP_PLUGIN_target_rev (uint64_t, uint64_t, uint64_t, uint64_t, >> - uint64_t, int, >> - void (*) (void *, const void *, size_t, >> - void *), >> - void (*) (void *, const void *, size_t, >> - void *), void *); >> + uint64_t, int, struct goacc_asyncqueue *); >> >> /* Prototypes for functions implemented by libgomp plugins. */ >> extern const char *GOMP_OFFLOAD_get_name (void); >> diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h >> index ba8fe348aba..4d2bfab4b71 100644 >> --- a/libgomp/libgomp.h >> +++ b/libgomp/libgomp.h >> @@ -1130,10 +1130,7 @@ extern void gomp_init_targets_once (void); >> extern int gomp_get_num_devices (void); >> extern bool gomp_target_task_fn (void
Re: libgomp: Simplify OpenMP reverse offload host <-> device memory copy implementation (was: [Patch] libgomp/nvptx: Prepare for reverse-offload callback handling)
Hi Thomas, On 21.03.23 16:53, Thomas Schwinge wrote: On 2022-08-26T11:07:28+0200, Tobias Burnus wrote: This patch adds initial [OpenMP reverse offload] support for nvptx. CUDA does lockup when trying to copy data from the currently running stream; hence, a new stream is generated to do the memory copying. As part of other work, where I had to touch those special code paths, I found that we may reduce complexity a little bit "by using the existing 'goacc_asyncqueue' instead of re-coding parts of it". OK to push "libgomp: Simplify OpenMP reverse offload host <-> device memory copy implementation" (still testing), see attached? I don't think that just calling "exit (EXIT_FAILURE);" is the the proper way – I think that should be GOMP_PLUGIN_fatal in the plugin and gomp_fatal in target.c. Otherwise, it LGTM. Tobias Subject: [PATCH] libgomp: Simplify OpenMP reverse offload host <-> device memory copy implementation ... by using the existing 'goacc_asyncqueue' instead of re-coding parts of it. Follow-up to commit 131d18e928a3ea1ab2d3bf61aa92d68a8a254609 "libgomp/nvptx: Prepare for reverse-offload callback handling", and commit ea4b23d9c82d9be3b982c3519fe5e8e9d833a6a8 "libgomp: Handle OpenMP's reverse offloads". libgomp/ * target.c (gomp_target_rev): Instead of 'dev_to_host_cpy', 'host_to_dev_cpy', 'token', take a single 'goacc_asyncqueue'. * libgomp.h (gomp_target_rev): Adjust. * libgomp-plugin.c (GOMP_PLUGIN_target_rev): Adjust. * libgomp-plugin.h (GOMP_PLUGIN_target_rev): Adjust. * plugin/plugin-gcn.c (process_reverse_offload): Adjust. * plugin/plugin-nvptx.c (rev_off_dev_to_host_cpy) (rev_off_host_to_dev_cpy): Remove. (GOMP_OFFLOAD_run): Adjust. --- libgomp/libgomp-plugin.c | 7 +-- libgomp/libgomp-plugin.h | 6 +- libgomp/libgomp.h | 5 +- libgomp/plugin/plugin-gcn.c | 2 +- libgomp/plugin/plugin-nvptx.c | 77 ++--- libgomp/target.c | 102 +++--- 6 files changed, 96 insertions(+), 103 deletions(-) diff --git a/libgomp/libgomp-plugin.c b/libgomp/libgomp-plugin.c index 27e7c94ba9b..d696515eeb6 100644 --- a/libgomp/libgomp-plugin.c +++ b/libgomp/libgomp-plugin.c @@ -82,11 +82,8 @@ GOMP_PLUGIN_fatal (const char *msg, ...) void GOMP_PLUGIN_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr, uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num, - void (*dev_to_host_cpy) (void *, const void *, size_t, - void *), - void (*host_to_dev_cpy) (void *, const void *, size_t, - void *), void *token) + struct goacc_asyncqueue *aq) { gomp_target_rev (fn_ptr, mapnum, devaddrs_ptr, sizes_ptr, kinds_ptr, dev_num, -dev_to_host_cpy, host_to_dev_cpy, token); +aq); } diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h index 28267f75f7a..42ee3d6c7f9 100644 --- a/libgomp/libgomp-plugin.h +++ b/libgomp/libgomp-plugin.h @@ -121,11 +121,7 @@ extern void GOMP_PLUGIN_fatal (const char *, ...) __attribute__ ((noreturn, format (printf, 1, 2))); extern void GOMP_PLUGIN_target_rev (uint64_t, uint64_t, uint64_t, uint64_t, - uint64_t, int, - void (*) (void *, const void *, size_t, - void *), - void (*) (void *, const void *, size_t, - void *), void *); + uint64_t, int, struct goacc_asyncqueue *); /* Prototypes for functions implemented by libgomp plugins. */ extern const char *GOMP_OFFLOAD_get_name (void); diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h index ba8fe348aba..4d2bfab4b71 100644 --- a/libgomp/libgomp.h +++ b/libgomp/libgomp.h @@ -1130,10 +1130,7 @@ extern void gomp_init_targets_once (void); extern int gomp_get_num_devices (void); extern bool gomp_target_task_fn (void *); extern void gomp_target_rev (uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, - int, - void (*) (void *, const void *, size_t, void *), - void (*) (void *, const void *, size_t, void *), - void *); + int, struct goacc_asyncqueue *); /* Splay tree definitions. */ typedef struct splay_tree_node_s *splay_tree_node; diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c index 347803762eb..2181bf0235f 100644 --- a/libgomp/plugin/plugin-gcn.c +++ b/libgomp/plugin/plugin-gcn.c @@ -1949,7 +1949,7 @@ process_reverse_offload (uint64_t fn, uint64_t mapnum, uint64_t hostaddrs, { int dev_num = dev_num64; GOMP_PLUGIN_target_rev (fn, mapnum, hostaddrs, sizes, kinds, dev_num
[og12] libgomp: Simplify OpenMP reverse offload host <-> device memory copy implementation (was: [Patch] libgomp/nvptx: Prepare for reverse-offload callback handling)
Hi! On 2023-03-21T16:53:31+0100, I wrote: > On 2022-08-26T11:07:28+0200, Tobias Burnus wrote: >> This patch adds initial [OpenMP reverse offload] support for nvptx. > >> CUDA does lockup when trying to copy data from the currently running >> stream; hence, a new stream is generated to do the memory copying. > > As part of other work, where I had to touch those special code paths, I > found that we may reduce complexity a little bit "by using the existing > 'goacc_asyncqueue' instead of re-coding parts of it". OK to push > "libgomp: Simplify OpenMP reverse offload host <-> device memory copy > implementation" > (still testing), see attached? My other work now actually does depend on this; I've pushed to devel/omp/gcc-12 branch commit c276fa0616eb79ddc4d0245e775a841e84cbb7dd "libgomp: Simplify OpenMP reverse offload host <-> device memory copy implementation", see attached. May this also go into master branch still at this time, or "next year"? Grüße Thomas - Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955 >From c276fa0616eb79ddc4d0245e775a841e84cbb7dd Mon Sep 17 00:00:00 2001 From: Thomas Schwinge Date: Tue, 21 Mar 2023 16:14:16 +0100 Subject: [PATCH] libgomp: Simplify OpenMP reverse offload host <-> device memory copy implementation ... by using the existing 'goacc_asyncqueue' instead of re-coding parts of it. Follow-up to commit 131d18e928a3ea1ab2d3bf61aa92d68a8a254609 "libgomp/nvptx: Prepare for reverse-offload callback handling", and commit ea4b23d9c82d9be3b982c3519fe5e8e9d833a6a8 "libgomp: Handle OpenMP's reverse offloads". libgomp/ * target.c (gomp_target_rev): Instead of 'dev_to_host_cpy', 'host_to_dev_cpy', 'token', take a single 'goacc_asyncqueue'. * libgomp.h (gomp_target_rev): Adjust. * libgomp-plugin.c (GOMP_PLUGIN_target_rev): Adjust. * libgomp-plugin.h (GOMP_PLUGIN_target_rev): Adjust. * plugin/plugin-gcn.c (process_reverse_offload): Adjust. * plugin/plugin-nvptx.c (rev_off_dev_to_host_cpy) (rev_off_host_to_dev_cpy): Remove. (GOMP_OFFLOAD_run): Adjust. --- libgomp/ChangeLog.omp | 10 libgomp/libgomp-plugin.c | 7 +-- libgomp/libgomp-plugin.h | 6 +- libgomp/libgomp.h | 5 +- libgomp/plugin/plugin-gcn.c | 2 +- libgomp/plugin/plugin-nvptx.c | 77 ++--- libgomp/target.c | 102 +++--- 7 files changed, 106 insertions(+), 103 deletions(-) diff --git a/libgomp/ChangeLog.omp b/libgomp/ChangeLog.omp index 9360db66b03..fb352b39a6d 100644 --- a/libgomp/ChangeLog.omp +++ b/libgomp/ChangeLog.omp @@ -1,5 +1,15 @@ 2023-03-24 Thomas Schwinge + * target.c (gomp_target_rev): Instead of 'dev_to_host_cpy', + 'host_to_dev_cpy', 'token', take a single 'goacc_asyncqueue'. + * libgomp.h (gomp_target_rev): Adjust. + * libgomp-plugin.c (GOMP_PLUGIN_target_rev): Adjust. + * libgomp-plugin.h (GOMP_PLUGIN_target_rev): Adjust. + * plugin/plugin-gcn.c (process_reverse_offload): Adjust. + * plugin/plugin-nvptx.c (rev_off_dev_to_host_cpy) + (rev_off_host_to_dev_cpy): Remove. + (GOMP_OFFLOAD_run): Adjust. + * target.c (gomp_unmap_vars_internal): Queue splay-tree keys for removal after main loop. diff --git a/libgomp/libgomp-plugin.c b/libgomp/libgomp-plugin.c index 316de749f69..c76fa63da83 100644 --- a/libgomp/libgomp-plugin.c +++ b/libgomp/libgomp-plugin.c @@ -82,11 +82,8 @@ GOMP_PLUGIN_fatal (const char *msg, ...) void GOMP_PLUGIN_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr, uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num, - void (*dev_to_host_cpy) (void *, const void *, size_t, - void *), - void (*host_to_dev_cpy) (void *, const void *, size_t, - void *), void *token) + struct goacc_asyncqueue *aq) { gomp_target_rev (fn_ptr, mapnum, devaddrs_ptr, sizes_ptr, kinds_ptr, dev_num, - dev_to_host_cpy, host_to_dev_cpy, token); + aq); } diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h index 66d995f33e8..ca557a79380 100644 --- a/libgomp/libgomp-plugin.h +++ b/libgomp/libgomp-plugin.h @@ -122,11 +122,7 @@ extern void GOMP_PLUGIN_fatal (const char *, ...) __attribute__ ((noreturn, format (printf, 1, 2))); extern void GOMP_PLUGIN_target_rev (uint64_t, uint64_t, uint64_t, uint64_t, -uint64_t, int, -void (*) (void *, const void *, size_t, - void *), -void (*) (void *, const void *, size_t, - void *), void *); +uint64_t, int, struct goacc_asyncqueue *); /* Prototypes for functions implemented by libgomp plugins. */ extern const char *GOMP_OFFLOAD_get_name (void); diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h index 92f6f14960f..3b2b4aa9534 100644 --- a/libgomp/libgomp.h +++ b/libgomp/libgomp.h @@ -1127,10 +1127,7 @@ extern void gomp_init_
libgomp: Simplify OpenMP reverse offload host <-> device memory copy implementation (was: [Patch] libgomp/nvptx: Prepare for reverse-offload callback handling)
Hi! On 2022-08-26T11:07:28+0200, Tobias Burnus wrote: > This patch adds initial [OpenMP reverse offload] support for nvptx. > CUDA does lockup when trying to copy data from the currently running > stream; hence, a new stream is generated to do the memory copying. As part of other work, where I had to touch those special code paths, I found that we may reduce complexity a little bit "by using the existing 'goacc_asyncqueue' instead of re-coding parts of it". OK to push "libgomp: Simplify OpenMP reverse offload host <-> device memory copy implementation" (still testing), see attached? Grüße Thomas - Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955 >From 65636e924f69a146e571e7a7009304803e24ca1a Mon Sep 17 00:00:00 2001 From: Thomas Schwinge Date: Tue, 21 Mar 2023 16:14:16 +0100 Subject: [PATCH] libgomp: Simplify OpenMP reverse offload host <-> device memory copy implementation ... by using the existing 'goacc_asyncqueue' instead of re-coding parts of it. Follow-up to commit 131d18e928a3ea1ab2d3bf61aa92d68a8a254609 "libgomp/nvptx: Prepare for reverse-offload callback handling", and commit ea4b23d9c82d9be3b982c3519fe5e8e9d833a6a8 "libgomp: Handle OpenMP's reverse offloads". libgomp/ * target.c (gomp_target_rev): Instead of 'dev_to_host_cpy', 'host_to_dev_cpy', 'token', take a single 'goacc_asyncqueue'. * libgomp.h (gomp_target_rev): Adjust. * libgomp-plugin.c (GOMP_PLUGIN_target_rev): Adjust. * libgomp-plugin.h (GOMP_PLUGIN_target_rev): Adjust. * plugin/plugin-gcn.c (process_reverse_offload): Adjust. * plugin/plugin-nvptx.c (rev_off_dev_to_host_cpy) (rev_off_host_to_dev_cpy): Remove. (GOMP_OFFLOAD_run): Adjust. --- libgomp/libgomp-plugin.c | 7 +-- libgomp/libgomp-plugin.h | 6 +- libgomp/libgomp.h | 5 +- libgomp/plugin/plugin-gcn.c | 2 +- libgomp/plugin/plugin-nvptx.c | 77 ++--- libgomp/target.c | 102 +++--- 6 files changed, 96 insertions(+), 103 deletions(-) diff --git a/libgomp/libgomp-plugin.c b/libgomp/libgomp-plugin.c index 27e7c94ba9b..d696515eeb6 100644 --- a/libgomp/libgomp-plugin.c +++ b/libgomp/libgomp-plugin.c @@ -82,11 +82,8 @@ GOMP_PLUGIN_fatal (const char *msg, ...) void GOMP_PLUGIN_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr, uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num, - void (*dev_to_host_cpy) (void *, const void *, size_t, - void *), - void (*host_to_dev_cpy) (void *, const void *, size_t, - void *), void *token) + struct goacc_asyncqueue *aq) { gomp_target_rev (fn_ptr, mapnum, devaddrs_ptr, sizes_ptr, kinds_ptr, dev_num, - dev_to_host_cpy, host_to_dev_cpy, token); + aq); } diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h index 28267f75f7a..42ee3d6c7f9 100644 --- a/libgomp/libgomp-plugin.h +++ b/libgomp/libgomp-plugin.h @@ -121,11 +121,7 @@ extern void GOMP_PLUGIN_fatal (const char *, ...) __attribute__ ((noreturn, format (printf, 1, 2))); extern void GOMP_PLUGIN_target_rev (uint64_t, uint64_t, uint64_t, uint64_t, -uint64_t, int, -void (*) (void *, const void *, size_t, - void *), -void (*) (void *, const void *, size_t, - void *), void *); +uint64_t, int, struct goacc_asyncqueue *); /* Prototypes for functions implemented by libgomp plugins. */ extern const char *GOMP_OFFLOAD_get_name (void); diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h index ba8fe348aba..4d2bfab4b71 100644 --- a/libgomp/libgomp.h +++ b/libgomp/libgomp.h @@ -1130,10 +1130,7 @@ extern void gomp_init_targets_once (void); extern int gomp_get_num_devices (void); extern bool gomp_target_task_fn (void *); extern void gomp_target_rev (uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, - int, - void (*) (void *, const void *, size_t, void *), - void (*) (void *, const void *, size_t, void *), - void *); + int, struct goacc_asyncqueue *); /* Splay tree definitions. */ typedef struct splay_tree_node_s *splay_tree_node; diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c index 347803762eb..2181bf0235f 100644 --- a/libgomp/plugin/plugin-gcn.c +++ b/libgomp/plugin/plugin-gcn.c @@ -1949,7 +1949,7 @@ process_reverse_offload (uint64_t fn, uint64_t mapnum, uint64_t hostaddrs, { int dev_num = dev_num64; GOMP_PLUGIN_target_rev (fn, mapnum, hostaddrs, sizes, kinds, dev_num, - NULL, NULL, NULL); + NULL); } /* Output any data written to console output from the kernel. It is expected diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index 5bd5a419e0e..4a710851ee5 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -56,6 +56,7 @@ #i