Re: [PATCH] libgomp: Fix hang when profiling OpenACC programs with CUDA 9.0 nvprof
init_start event callback, which is dispatched during > + acc_init_1. Trying to lock acc_device_lock during such a call (as we > do > + in the else clause below), will result in deadlock, since the lock has > + already been taken by the acc_init_1 caller. We work around this > problem > + by using the acc_get_device_type property "If the device type has not > yet > + been selected, the value acc_device_none may be returned". */ > +; >else > { >acc_prof_info prof_info; > diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-cb-call.c > b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-cb-call.c > new file mode 100644 > index 000..6114164 > --- /dev/null > +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-cb-call.c > @@ -0,0 +1,39 @@ > +/* { dg-do run } */ > +/* { dg-timeout 10 } */ > + > +/* Test the calling of acc_get_device_type() from within the > device_init_start > + callback. This occurs when the CUDA 9.0 nvprof tool is used, and can > + deadlock if not handled properly. */ > + > +#include > + > +static acc_prof_reg reg; > +static acc_prof_reg unreg; > +static acc_prof_lookup_func lookup; > + > +void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, > acc_prof_lookup_func lookup_) > +{ > + reg = reg_; > + unreg = unreg_; > + lookup = lookup_; > +} > + > +static acc_device_t acc_device_type; > + > +static void cb_device_init_start (acc_prof_info *prof_info, acc_event_info > *event_info, acc_api_info *api_info) > +{ > + acc_device_type = acc_get_device_type (); > +} > + > +int main(void) > +{ > + acc_register_library (acc_prof_register, acc_prof_unregister, > acc_prof_lookup); > + > + reg (acc_ev_device_init_start, cb_device_init_start, acc_reg); > + > + acc_init (acc_device_host); > + acc_shutdown (acc_device_host); > + > + acc_init (acc_device_default); > + acc_shutdown (acc_device_default); > +} - Mentor Graphics (Deutschland) GmbH, Arnulfstraße 201, 80634 München / Germany Registergericht München HRB 106955, Geschäftsführer: Thomas Heurung, Alexander Walter >From 82e43a3263068e006dc96f9bf0ace033e45038ef Mon Sep 17 00:00:00 2001 From: Thomas Schwinge Date: Tue, 14 Jul 2020 12:43:53 +0200 Subject: [PATCH] into "libgomp: Fix hang when profiling OpenACC programs with CUDA 9.0 nvprof" --- libgomp/libgomp.texi | 11 +++ .../acc_prof-cb-call.c| 39 - .../acc_prof-init-2.c | 80 +++ 3 files changed, 91 insertions(+), 39 deletions(-) delete mode 100644 libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-cb-call.c create mode 100644 libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-2.c diff --git a/libgomp/libgomp.texi b/libgomp/libgomp.texi index b946743f9b1..5331230c207 100644 --- a/libgomp/libgomp.texi +++ b/libgomp/libgomp.texi @@ -1967,6 +1967,12 @@ in @var{devicetype}, to use when executing a parallel or kernels region. This function returns what device type will be used when executing a parallel or kernels region. +This function returns @code{acc_device_none} if +@code{acc_get_device_type} is called from +@code{acc_ev_device_init_start}, @code{acc_ev_device_init_end} +callbacks of the OpenACC Profiling Interface (@ref{OpenACC Profiling +Interface}), that is, if the device is currently being initialized. + @item @emph{C/C++}: @multitable @columnfractions .20 .80 @item @emph{Prototype}: @tab @code{acc_device_t acc_get_device_type(void);} @@ -3382,6 +3388,11 @@ every event that has been registered. We're not yet accounting for the fact that @cite{OpenACC events may occur during event processing}. +We just handle one case specially, as required by CUDA 9.0 +@command{nvprof}, that @code{acc_get_device_type} +(@ref{acc_get_device_type})) may be called from +@code{acc_ev_device_init_start}, @code{acc_ev_device_init_end} +callbacks. We're not yet implementing initialization via a @code{acc_register_library} function that is either statically linked diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-cb-call.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-cb-call.c deleted file mode 100644 index 6114164aa24..000 --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-cb-call.c +++ /dev/null @@ -1,39 +0,0 @@ -/* { dg-do run } */ -/* { dg-timeout 10 } */ - -/* Test the calling of acc_get_device_type() from within the device_init_start - callback. This occurs when the CUDA 9.0 nvprof tool is used, and can - deadlock if not handled properly. */ - -#include - -static acc_prof_reg reg; -static acc_prof_reg unreg; -static acc_prof_lookup_func lookup; - -void acc_register_library (acc_prof_reg reg_, acc
[PATCH] libgomp: Fix hang when profiling OpenACC programs with CUDA 9.0 nvprof
Hello (This patch was previously posted for OG7 at: https://gcc.gnu.org/pipermail/gcc-patches/2018-February/494594.html). When the version of nvprof in CUDA 9.0 is run on an OpenACC program, it sets up a callback that is called on device initialization. Inside the callback, it calls the acc_get_device_type() function in libgomp. Inside acc_get_device_type(), it attempts to acquire the acc_device_lock lock, but that has already been acquired by goacc_lazy_initialize() at this point, so the program deadlocks. This is fixed by making acc_get_device_type() return acc_device_none without attempting to acequire the lock when initialization has not finished yet. This appears to be legal according to the OpenACC spec, since it states 'If the device type has not yet been selected, the value acc_device_none may be returned.' (in section 3.2.3 of the OpenACC 2.7 spec). I have added a testcase that sets up the situation presented by nvprof. This testcase hangs without the patch (hence the short dg-timeout), and passes with it. Tested on a x86-64 host with Nvidia and AMD GCN offloading. Okay for master, GCC 10 branch and OG10? Thanks Kwok commit d20f269e8571a76d682a500e78654ddd260ffaf1 Author: Kwok Cheung Yeung Date: Fri Jul 10 14:06:26 2020 -0700 libgomp: Fix hang when profiling OpenACC programs with CUDA 9.0 nvprof The version of nvprof in CUDA 9.0 causes a hang when used to profile an OpenACC program. This is because it calls acc_get_device_type from a callback called during device initialization, which then attempts to acquire acc_device_lock while it is already taken, resulting in deadlock. This works around the issue by returning acc_device_none from acc_get_device_type without attempting to acquire the lock when initialization has not completed yet. 2020-07-13 Tom de Vries Cesar Philippidis Thomas Schwinge Kwok Cheung Yeung libgomp/ * oacc-init.c (acc_init_state_lock, acc_init_state, acc_init_thread): New variable. (acc_init_1): Set acc_init_thread to pthread_self (). Set acc_init_state to initializing at the start, and to initialized at the end. (self_initializing_p): New function. (acc_get_device_type): Return acc_device_none if called by thread that is currently executing acc_init_1. * testsuite/libgomp.oacc-c-c++-common/acc_prof-cb-call.c: New. diff --git a/libgomp/oacc-init.c b/libgomp/oacc-init.c index 5d786a5..1e7f934 100644 --- a/libgomp/oacc-init.c +++ b/libgomp/oacc-init.c @@ -40,6 +40,11 @@ static gomp_mutex_t acc_device_lock; +static gomp_mutex_t acc_init_state_lock; +static enum { uninitialized, initializing, initialized } acc_init_state + = uninitialized; +static pthread_t acc_init_thread; + /* A cached version of the dispatcher for the global "current" accelerator type, e.g. used as the default when creating new host threads. This is the device-type equivalent of goacc_device_num (which specifies which device to @@ -228,6 +233,11 @@ acc_dev_num_out_of_range (acc_device_t d, int ord, int ndevs) static struct gomp_device_descr * acc_init_1 (acc_device_t d, acc_construct_t parent_construct, int implicit) { + gomp_mutex_lock (_init_state_lock); + acc_init_state = initializing; + acc_init_thread = pthread_self (); + gomp_mutex_unlock (_init_state_lock); + bool check_not_nested_p; if (implicit) { @@ -317,6 +327,14 @@ acc_init_1 (acc_device_t d, acc_construct_t parent_construct, int implicit) _info); } + /* We're setting 'initialized' *after* 'goacc_profiling_dispatch', so that a + nested 'acc_get_device_type' called from a profiling callback still sees + 'initializing', so that we don't deadlock when it then again tries to lock + 'goacc_prof_lock'. See also the discussion in 'acc_get_device_type'. */ + gomp_mutex_lock (_init_state_lock); + acc_init_state = initialized; + gomp_mutex_unlock (_init_state_lock); + return base_dev; } @@ -643,6 +661,17 @@ acc_set_device_type (acc_device_t d) ialias (acc_set_device_type) +static bool +self_initializing_p (void) +{ + bool res; + gomp_mutex_lock (_init_state_lock); + res = (acc_init_state == initializing +&& pthread_equal (acc_init_thread, pthread_self ())); + gomp_mutex_unlock (_init_state_lock); + return res; +} + acc_device_t acc_get_device_type (void) { @@ -652,6 +681,15 @@ acc_get_device_type (void) if (thr && thr->base_dev) res = acc_device_type (thr->base_dev->type); + else if (self_initializing_p ()) +/* The Cuda libaccinj64.so version 9.0+ calls acc_get_device_type during the + acc_ev_device_init_start event callback, which is dispatched during + acc_init_1. Trying to lock acc_device_lock during such a call (as we do + in the else clause below), will result in deadlock, since the lock has +