Re: [PATCH 2/6, OpenACC, libgomp] Async re-work, oacc-* parts (revised, v4)

2019-01-08 Thread Chung-Lin Tang

On 2019/1/7 10:15 AM, Thomas Schwinge wrote:

> Well, the "Properly handle wait clause with no arguments" changes still
> need to be completed and go in first (to avoid introducing regressions),
> and then I will have to see your whole set of changes that you intend to
> commit: the bits you've incrementally posted still don't include several
> of the changes I suggested and provided patches for (again, to avoid
> introducing regressions).


I'll look at that state again.


> But GCC now is in "regression and documentation fixes mode", so I fear
> that it's too late now?


Maybe...I don't know.


> > --- oacc-async.c(revision 267507)
> > +++ oacc-async.c(working copy)
> > @@ -62,12 +158,10 @@ acc_wait (int async)
> > +  goacc_aq aq = lookup_goacc_asyncqueue (thr, true, async);
> > +  thr->dev->openacc.async.synchronize_func (aq);

> Have to check the result here?  Like you're doing here, for example:


> >  acc_wait_async (int async1, int async2)
> >  {
> > +  if (!thr->dev->openacc.async.synchronize_func (aq1))
> > +gomp_fatal ("wait on %d failed", async1);
> > +  if (!thr->dev->openacc.async.serialize_func (aq1, aq2))
> > +gomp_fatal ("ordering of async ids %d and %d failed", async1, async2);
> > --- oacc-parallel.c (revision 267507)
> > +++ oacc-parallel.c (working copy)
> > @@ -521,17 +500,22 @@ goacc_wait (int async, int num_waits, va_list *ap)
> > if (async == acc_async_sync)
> > -   acc_wait (qid);
> > +   acc_dev->openacc.async.synchronize_func (aq);

> Likewise?


Oh okay, I forgot about those sites.



> Also, I had to apply additional changes as attached, to make this build.



Oh I had those changes, but forgot to update the other patches. I'll resend 
those later too.

Thanks,
Chung-Lin


Re: [PATCH 2/6, OpenACC, libgomp] Async re-work, oacc-* parts (revised, v4)

2019-01-07 Thread Thomas Schwinge
Hi Chung-Lin!

On Sat, 5 Jan 2019 17:47:10 +0800, Chung-Lin Tang  
wrote:
> this is the current version of the oacc-* parts of the Async Re-work patch.
> 
> I have reverted away from the earlier mentioned attempt of using lockless
> techniques to manage the asyncqueues; it is really hard to do in a 100% correct
> manner, unless we only use something like simple lists to manage them,
> which probably makes lookup unacceptably slow.
> 
> For now, I have changed to use the conventional locking and success/fail return
> codes for the synchronize/serialize hooks.

OK, thanks.


> I hope this is enough to pass
> and get committed.

Well, the "Properly handle wait clause with no arguments" changes still
need to be completed and go in first (to avoid introducing regressions),
and then I will have to see your whole set of changes that you intend to
commit: the bits you've incrementally posted still don't include several
of the changes I suggested and provided patches for (again, to avoid
introducing regressions).


But GCC now is in "regression and documentation fixes mode", so I fear
that it's too late now?


> --- oacc-async.c  (revision 267507)
> +++ oacc-async.c  (working copy)

> @@ -62,12 +158,10 @@ acc_wait (int async)

> +  goacc_aq aq = lookup_goacc_asyncqueue (thr, true, async);
> +  thr->dev->openacc.async.synchronize_func (aq);

Have to check the result here?  Like you're doing here, for example:

>  acc_wait_async (int async1, int async2)
>  {

> +  if (!thr->dev->openacc.async.synchronize_func (aq1))
> +gomp_fatal ("wait on %d failed", async1);
> +  if (!thr->dev->openacc.async.serialize_func (aq1, aq2))
> +gomp_fatal ("ordering of async ids %d and %d failed", async1, async2);

> --- oacc-parallel.c   (revision 267507)
> +++ oacc-parallel.c   (working copy)

> @@ -521,17 +500,22 @@ goacc_wait (int async, int num_waits, va_list *ap)

>if (async == acc_async_sync)
> - acc_wait (qid);
> + acc_dev->openacc.async.synchronize_func (aq);

Likewise?

>else if (qid == async)
> - ;/* If we're waiting on the same asynchronous queue as we're
> - launching on, the queue itself will order work as
> - required, so there's no need to wait explicitly.  */
> + /* If we're waiting on the same asynchronous queue as we're
> +launching on, the queue itself will order work as
> +required, so there's no need to wait explicitly.  */
> + ;
>else
> - acc_dev->openacc.async_wait_async_func (qid, async);
> + {
> +   goacc_aq aq2 = get_goacc_asyncqueue (async);
> +   acc_dev->openacc.async.synchronize_func (aq);
> +   acc_dev->openacc.async.serialize_func (aq, aq2);
> + }

Likewise?


Also, I had to apply additional changes as attached, to make this build.


Grüße
 Thomas


From e4c187a4be46682a989165c38bc6a8d8324554b9 Mon Sep 17 00:00:00 2001
From: Thomas Schwinge 
Date: Mon, 7 Jan 2019 13:25:18 +0100
Subject: [PATCH] [WIP] into async re-work: complete
 GOMP_OFFLOAD_openacc_async_synchronize, GOMP_OFFLOAD_openacc_async_serialize
 interface changes

---
 libgomp/libgomp-plugin.h  |  4 ++--
 libgomp/plugin/plugin-nvptx.c | 29 +++++++++++++++++++++--------
 2 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h
index e3c031a282a1..ce3ae125e208 100644
--- a/libgomp/libgomp-plugin.h
+++ b/libgomp/libgomp-plugin.h
@@ -115,8 +115,8 @@ extern void GOMP_OFFLOAD_openacc_destroy_thread_data (void *);
 extern struct goacc_asyncqueue *GOMP_OFFLOAD_openacc_async_construct (void);
 extern bool GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *);
 extern int GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *);
-extern void GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *);
-extern void GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *,
+extern bool GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *);
+extern bool GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *,
 		  struct goacc_asyncqueue *);
 extern void GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *,
 		   void (*)(void *), void *);
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index f42cbf488a79..12f87ba7be4d 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -1395,22 +1395,35 @@ GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
   return -1;
 }
 
-void
+bool
 GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
 {
-  //TODO Is this safe to call, or might this cause deadlock if something's locked?
-  CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream);
+  CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, aq->cuda_stream);
+  return r == CUDA_SUCCESS;
 }
 
-void
+bool
 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
   struct goacc_asyncqueue *aq2)
 {
+  CUresult r;
   

Re: [PATCH 2/6, OpenACC, libgomp] Async re-work, oacc-* parts (revised, v4)

2019-01-05 Thread Chung-Lin Tang

Hi Thomas,
this is the current version of the oacc-* parts of the Async Re-work patch.

I have reverted away from the earlier mentioned attempt of using lockless
techniques to manage the asyncqueues; it is really hard to do in a 100% correct
manner, unless we only use something like simple lists to manage them,
which probably makes lookup unacceptably slow.

For now, I have changed to use the conventional locking and success/fail return
codes for the synchronize/serialize hooks. I hope this is enough to pass
and get committed.

Thanks,
Chung-Lin

Index: oacc-async.c
===================================================================
--- oacc-async.c(revision 267507)
+++ oacc-async.c(working copy)
@@ -27,10 +27,99 @@
    <http://www.gnu.org/licenses/>.  */
 
 #include <assert.h>
+#include <string.h>
 #include "openacc.h"
 #include "libgomp.h"
 #include "oacc-int.h"
 
+static struct goacc_thread *
+get_goacc_thread (void)
+{
+  struct goacc_thread *thr = goacc_thread ();
+
+  if (!thr || !thr->dev)
+gomp_fatal ("no device active");
+
+  return thr;
+}
+
+static struct gomp_device_descr *
+get_goacc_thread_device (void)
+{
+  struct goacc_thread *thr = goacc_thread ();
+
+  if (!thr || !thr->dev)
+gomp_fatal ("no device active");
+
+  return thr->dev;
+}
+
+attribute_hidden struct goacc_asyncqueue *
+lookup_goacc_asyncqueue (struct goacc_thread *thr, bool create, int async)
+{
+  /* The special value acc_async_noval (-1) maps to the thread-specific
+ default async stream.  */
+  if (async == acc_async_noval)
+async = thr->default_async;
+
+  if (async == acc_async_sync)
+return NULL;
+
+  if (async < 0)
+gomp_fatal ("bad async %d", async);
+
+  struct goacc_asyncqueue *ret_aq = NULL;
+  struct gomp_device_descr *dev = thr->dev;
+
+  gomp_mutex_lock (&dev->openacc.async.lock);
+
+  if (!create
+  && (async >= dev->openacc.async.nasyncqueue
+ || !dev->openacc.async.asyncqueue[async]))
+goto end;
+
+  if (async >= dev->openacc.async.nasyncqueue)
+{
+  int diff = async + 1 - dev->openacc.async.nasyncqueue;
+  dev->openacc.async.asyncqueue
+   = gomp_realloc (dev->openacc.async.asyncqueue,
+   sizeof (goacc_aq) * (async + 1));
+  memset (dev->openacc.async.asyncqueue + dev->openacc.async.nasyncqueue,
+ 0, sizeof (goacc_aq) * diff);
+  dev->openacc.async.nasyncqueue = async + 1;
+}
+
+  if (!dev->openacc.async.asyncqueue[async])
+{
+  dev->openacc.async.asyncqueue[async] = dev->openacc.async.construct_func ();
+
+  if (!dev->openacc.async.asyncqueue[async])
+   {
+ gomp_mutex_unlock (&dev->openacc.async.lock);
+ gomp_fatal ("async %d creation failed", async);
+   }
+  
+  /* Link new async queue into active list.  */
+  goacc_aq_list n = gomp_malloc (sizeof (struct goacc_asyncqueue_list));
+  n->aq = dev->openacc.async.asyncqueue[async];
+  n->next = dev->openacc.async.active;
+  dev->openacc.async.active = n;
+}
+
+  ret_aq = dev->openacc.async.asyncqueue[async];
+
+ end:
+  gomp_mutex_unlock (&dev->openacc.async.lock);
+  return ret_aq;
+}
+
+attribute_hidden struct goacc_asyncqueue *
+get_goacc_asyncqueue (int async)
+{
+  struct goacc_thread *thr = get_goacc_thread ();
+  return lookup_goacc_asyncqueue (thr, true, async);
+}
+
 int
 acc_async_test (int async)
 {
@@ -42,18 +131,25 @@ acc_async_test (int async)
   if (!thr || !thr->dev)
 gomp_fatal ("no device active");
 
-  return thr->dev->openacc.async_test_func (async);
+  goacc_aq aq = lookup_goacc_asyncqueue (thr, true, async);
+  return thr->dev->openacc.async.test_func (aq);
 }
 
 int
 acc_async_test_all (void)
 {
-  struct goacc_thread *thr = goacc_thread ();
+  struct goacc_thread *thr = get_goacc_thread ();
 
-  if (!thr || !thr->dev)
-gomp_fatal ("no device active");
-
-  return thr->dev->openacc.async_test_all_func ();
+  int ret = 1;
+  gomp_mutex_lock (&thr->dev->openacc.async.lock);
+  for (goacc_aq_list l = thr->dev->openacc.async.active; l; l = l->next)
+if (!thr->dev->openacc.async.test_func (l->aq))
+  {
+   ret = 0;
+   break;
+  }
+  gomp_mutex_unlock (&thr->dev->openacc.async.lock);
+  return ret;
 }
 
 void
@@ -62,12 +158,10 @@ acc_wait (int async)
   if (!async_valid_p (async))
 gomp_fatal ("invalid async argument: %d", async);
 
-  struct goacc_thread *thr = goacc_thread ();
+  struct goacc_thread *thr = get_goacc_thread ();
 
-  if (!thr || !thr->dev)
-gomp_fatal ("no device active");
-
-  thr->dev->openacc.async_wait_func (async);
+  goacc_aq aq = lookup_goacc_asyncqueue (thr, true, async);
+  thr->dev->openacc.async.synchronize_func (aq);
 }
 
 /* acc_async_wait is an OpenACC 1.0 compatibility name for acc_wait.  */
@@ -84,23 +178,34 @@ acc_async_wait (int async)
 void
 acc_wait_async (int async1, int async2)
 {
-  struct goacc_thread *thr = goacc_thread ();
+  struct goacc_thread *thr = get_goacc_thread ();
 
-  if (!thr || !thr->dev)
-gomp_fatal ("no device