Re: [PATCH] libgomp: Fix hang when profiling OpenACC programs with CUDA 9.0 nvprof

2020-07-14 Thread Thomas Schwinge
init_start event callback, which is dispatched during
> +   acc_init_1.  Trying to lock acc_device_lock during such a call (as we 
> do
> +   in the else clause below), will result in deadlock, since the lock has
> +   already been taken by the acc_init_1 caller.  We work around this 
> problem
> +   by using the acc_get_device_type property "If the device type has not 
> yet
> +   been selected, the value acc_device_none may be returned".  */
> +;
>else
>  {
>acc_prof_info prof_info;
> diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-cb-call.c 
> b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-cb-call.c
> new file mode 100644
> index 000..6114164
> --- /dev/null
> +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-cb-call.c
> @@ -0,0 +1,39 @@
> +/* { dg-do run } */
> +/* { dg-timeout 10 } */
> +
> +/* Test the calling of acc_get_device_type() from within the 
> device_init_start
> +   callback.  This occurs when the CUDA 9.0 nvprof tool is used, and can
> +   deadlock if not handled properly.  */
> +
> +#include 
> +
> +static acc_prof_reg reg;
> +static acc_prof_reg unreg;
> +static acc_prof_lookup_func lookup;
> +
> +void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, 
> acc_prof_lookup_func lookup_)
> +{
> +  reg = reg_;
> +  unreg = unreg_;
> +  lookup = lookup_;
> +}
> +
> +static acc_device_t acc_device_type;
> +
> +static void cb_device_init_start (acc_prof_info *prof_info, acc_event_info 
> *event_info, acc_api_info *api_info)
> +{
> +  acc_device_type = acc_get_device_type ();
> +}
> +
> +int main(void)
> +{
> +  acc_register_library (acc_prof_register, acc_prof_unregister, 
> acc_prof_lookup);
> +
> +  reg (acc_ev_device_init_start, cb_device_init_start, acc_reg);
> +
> +  acc_init (acc_device_host);
> +  acc_shutdown (acc_device_host);
> +
> +  acc_init (acc_device_default);
> +  acc_shutdown (acc_device_default);
> +}


-
Mentor Graphics (Deutschland) GmbH, Arnulfstraße 201, 80634 München / Germany
Registergericht München HRB 106955, Geschäftsführer: Thomas Heurung, Alexander 
Walter
>From 82e43a3263068e006dc96f9bf0ace033e45038ef Mon Sep 17 00:00:00 2001
From: Thomas Schwinge 
Date: Tue, 14 Jul 2020 12:43:53 +0200
Subject: [PATCH] into "libgomp: Fix hang when profiling OpenACC programs with
 CUDA 9.0 nvprof"

---
 libgomp/libgomp.texi  | 11 +++
 .../acc_prof-cb-call.c| 39 -
 .../acc_prof-init-2.c | 80 +++
 3 files changed, 91 insertions(+), 39 deletions(-)
 delete mode 100644 libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-cb-call.c
 create mode 100644 libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-2.c

diff --git a/libgomp/libgomp.texi b/libgomp/libgomp.texi
index b946743f9b1..5331230c207 100644
--- a/libgomp/libgomp.texi
+++ b/libgomp/libgomp.texi
@@ -1967,6 +1967,12 @@ in @var{devicetype}, to use when executing a parallel or kernels region.
 This function returns what device type will be used when executing a
 parallel or kernels region.
 
+This function returns @code{acc_device_none} if
+@code{acc_get_device_type} is called from
+@code{acc_ev_device_init_start}, @code{acc_ev_device_init_end}
+callbacks of the OpenACC Profiling Interface (@ref{OpenACC Profiling
+Interface}), that is, if the device is currently being initialized.
+
 @item @emph{C/C++}:
 @multitable @columnfractions .20 .80
 @item @emph{Prototype}: @tab @code{acc_device_t acc_get_device_type(void);}
@@ -3382,6 +3388,11 @@ every event that has been registered.
 
 We're not yet accounting for the fact that @cite{OpenACC events may
 occur during event processing}.
+We just handle one case specially, as required by CUDA 9.0
+@command{nvprof}, that @code{acc_get_device_type}
+(@ref{acc_get_device_type})) may be called from
+@code{acc_ev_device_init_start}, @code{acc_ev_device_init_end}
+callbacks.
 
 We're not yet implementing initialization via a
 @code{acc_register_library} function that is either statically linked
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-cb-call.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-cb-call.c
deleted file mode 100644
index 6114164aa24..000
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-cb-call.c
+++ /dev/null
@@ -1,39 +0,0 @@
-/* { dg-do run } */
-/* { dg-timeout 10 } */
-
-/* Test the calling of acc_get_device_type() from within the device_init_start
-   callback.  This occurs when the CUDA 9.0 nvprof tool is used, and can
-   deadlock if not handled properly.  */
-
-#include 
-
-static acc_prof_reg reg;
-static acc_prof_reg unreg;
-static acc_prof_lookup_func lookup;
-
-void acc_register_library (acc_prof_reg reg_, acc

[PATCH] libgomp: Fix hang when profiling OpenACC programs with CUDA 9.0 nvprof

2020-07-13 Thread Kwok Cheung Yeung

Hello

(This patch was previously posted for OG7 at: 
https://gcc.gnu.org/pipermail/gcc-patches/2018-February/494594.html).


When the version of nvprof in CUDA 9.0 is run on an OpenACC program, it sets up 
a callback that is called on device initialization. Inside the callback, it 
calls the acc_get_device_type() function in libgomp. Inside 
acc_get_device_type(), it attempts to acquire the acc_device_lock lock, but that 
has already been acquired by goacc_lazy_initialize() at this point, so the 
program deadlocks.


This is fixed by making acc_get_device_type() return acc_device_none without 
attempting to acequire the lock when initialization has not finished yet. This 
appears to be legal according to the OpenACC spec, since it states 'If the 
device type has not yet been selected, the value acc_device_none may be 
returned.' (in section 3.2.3 of the OpenACC 2.7 spec).


I have added a testcase that sets up the situation presented by nvprof. This 
testcase hangs without the patch (hence the short dg-timeout), and passes with 
it. Tested on a x86-64 host with Nvidia and AMD GCN offloading.


Okay for master, GCC 10 branch and OG10?

Thanks

Kwok
commit d20f269e8571a76d682a500e78654ddd260ffaf1
Author: Kwok Cheung Yeung 
Date:   Fri Jul 10 14:06:26 2020 -0700

libgomp: Fix hang when profiling OpenACC programs with CUDA 9.0 nvprof

The version of nvprof in CUDA 9.0 causes a hang when used to profile an
OpenACC program.  This is because it calls acc_get_device_type from
a callback called during device initialization, which then attempts
to acquire acc_device_lock while it is already taken, resulting in
deadlock.  This works around the issue by returning acc_device_none
from acc_get_device_type without attempting to acquire the lock when
initialization has not completed yet.

2020-07-13  Tom de Vries  
Cesar Philippidis  
Thomas Schwinge  
Kwok Cheung Yeung  

libgomp/
* oacc-init.c (acc_init_state_lock, acc_init_state, acc_init_thread):
New variable.
(acc_init_1): Set acc_init_thread to pthread_self ().  Set
acc_init_state to initializing at the start, and to initialized at the
end.
(self_initializing_p): New function.
(acc_get_device_type): Return acc_device_none if called by thread that
is currently executing acc_init_1.
* testsuite/libgomp.oacc-c-c++-common/acc_prof-cb-call.c: New.

diff --git a/libgomp/oacc-init.c b/libgomp/oacc-init.c
index 5d786a5..1e7f934 100644
--- a/libgomp/oacc-init.c
+++ b/libgomp/oacc-init.c
@@ -40,6 +40,11 @@
 
 static gomp_mutex_t acc_device_lock;
 
+static gomp_mutex_t acc_init_state_lock;
+static enum { uninitialized, initializing, initialized } acc_init_state
+  = uninitialized;
+static pthread_t acc_init_thread;
+
 /* A cached version of the dispatcher for the global "current" accelerator 
type,
e.g. used as the default when creating new host threads.  This is the
device-type equivalent of goacc_device_num (which specifies which device to
@@ -228,6 +233,11 @@ acc_dev_num_out_of_range (acc_device_t d, int ord, int 
ndevs)
 static struct gomp_device_descr *
 acc_init_1 (acc_device_t d, acc_construct_t parent_construct, int implicit)
 {
+  gomp_mutex_lock (_init_state_lock);
+  acc_init_state = initializing;
+  acc_init_thread = pthread_self ();
+  gomp_mutex_unlock (_init_state_lock);
+
   bool check_not_nested_p;
   if (implicit)
 {
@@ -317,6 +327,14 @@ acc_init_1 (acc_device_t d, acc_construct_t 
parent_construct, int implicit)
_info);
 }
 
+  /* We're setting 'initialized' *after* 'goacc_profiling_dispatch', so that a
+ nested 'acc_get_device_type' called from a profiling callback still sees
+ 'initializing', so that we don't deadlock when it then again tries to lock
+ 'goacc_prof_lock'.  See also the discussion in 'acc_get_device_type'.  */
+  gomp_mutex_lock (_init_state_lock);
+  acc_init_state = initialized;
+  gomp_mutex_unlock (_init_state_lock);
+
   return base_dev;
 }
 
@@ -643,6 +661,17 @@ acc_set_device_type (acc_device_t d)
 
 ialias (acc_set_device_type)
 
+static bool
+self_initializing_p (void)
+{
+  bool res;
+  gomp_mutex_lock (_init_state_lock);
+  res = (acc_init_state == initializing
+&& pthread_equal (acc_init_thread, pthread_self ()));
+  gomp_mutex_unlock (_init_state_lock);
+  return res;
+}
+
 acc_device_t
 acc_get_device_type (void)
 {
@@ -652,6 +681,15 @@ acc_get_device_type (void)
 
   if (thr && thr->base_dev)
 res = acc_device_type (thr->base_dev->type);
+  else if (self_initializing_p ())
+/* The Cuda libaccinj64.so version 9.0+ calls acc_get_device_type during 
the
+   acc_ev_device_init_start event callback, which is dispatched during
+   acc_init_1.  Trying to lock acc_device_lock during such a call (as we do
+   in the else clause below), will result in deadlock, since the lock has
+