Hi,

this patch adds the following builtins in C/C++:
- __builtin_goacc_gang_id
- __builtin_goacc_worker_id
- __builtin_goacc_vector_id
- __builtin_goacc_gang_size
- __builtin_goacc_worker_size
- __builtin_goacc_vector_size


We have openacc C/C++ test-cases using the following nvptx idiom:
...
      __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
      __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
      __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
...

Typically these asm insns are guarded with acc_on_device (acc_device_nvidia), and skipping -O0:
...
/* This code uses nvptx inline assembly guarded with acc_on_device,
   which is not optimized away at -O0, and then confuses the target
   assembler.
   { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
...

This patch replaces those asm statements with the corresponding new builtins, allowing the test-cases to work for all offloading targets, not just nvptx, and also at O0. [ Note that the proposed fix for PR82391 at https://gcc.gnu.org/ml/gcc-patches/2017-12/msg01529.html also addresses the O0 problem. ]

Bootstrapped and reg-tested on x86_64.
Build and reg-tested on x86_64 with nvptx accelerator.

OK for trunk?

Thanks,
- Tom
Add __builtin_goacc_{gang,worker,vector}_{id,size}

2017-12-30  Tom de Vries  <t...@codesourcery.com>

	PR libgomp/82428
	* builtins.def (DEF_GOACC_BUILTIN_ONLY): Define.
	* omp-builtins.def (BUILT_IN_GOACC_GANG_ID, BUILT_IN_GOACC_GANG_SIZE)
	(BUILT_IN_GOACC_WORKER_ID, BUILT_IN_GOACC_WORKER_SIZE
	(BUILT_IN_GOACC_VECTOR_ID, BUILT_IN_GOACC_VECTOR_SIZE): New builtin.
	* omp-offload.c (oacc_dim_call_1): Factor out of ...
	(oacc_dim_call): ... here.
	(fold_goacc_dim_id_size): New function.
	(execute_oacc_device_lower): Use fold_goacc_dim_id_size.
	* internal-fn.c (expand_GOACC_DIM_SIZE, expand_GOACC_DIM_POS): Handle
	MEM_P lhs.
	* omp-general.c (oacc_get_fn_dim_size): Handle routines.

	* f95-lang.c (DEF_GOACC_BUILTIN_ONLY): Define.

	* testsuite/libgomp.oacc-c-c++-common/gang-static-2.c: Use
	__builtin_goacc_{gang,worker,vector}_{id,size}.
	* testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/loop-g-1.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/loop-g-2.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/loop-red-g-1.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/loop-red-v-1.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/loop-red-v-2.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/loop-red-w-1.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/loop-red-w-2.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/loop-v-1.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/loop-w-1.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/routine-g-1.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/routine-v-1.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/routine-w-1.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/routine-wv-2.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/tile-1.c: Same.

---
 gcc/builtins.def                                   |  4 +
 gcc/fortran/f95-lang.c                             |  4 +
 gcc/internal-fn.c                                  | 10 ++-
 gcc/omp-builtins.def                               | 13 ++++
 gcc/omp-general.c                                  |  3 +
 gcc/omp-offload.c                                  | 90 ++++++++++++++++++++--
 .../libgomp.oacc-c-c++-common/gang-static-2.c      | 20 ++---
 .../libgomp.oacc-c-c++-common/loop-auto-1.c        | 17 ++--
 .../libgomp.oacc-c-c++-common/loop-dim-default.c   | 13 ++--
 .../testsuite/libgomp.oacc-c-c++-common/loop-g-1.c | 16 ++--
 .../testsuite/libgomp.oacc-c-c++-common/loop-g-2.c | 15 ++--
 .../libgomp.oacc-c-c++-common/loop-gwv-1.c         | 16 ++--
 .../libgomp.oacc-c-c++-common/loop-red-g-1.c       | 15 ++--
 .../libgomp.oacc-c-c++-common/loop-red-gwv-1.c     | 15 ++--
 .../libgomp.oacc-c-c++-common/loop-red-v-1.c       | 15 ++--
 .../libgomp.oacc-c-c++-common/loop-red-v-2.c       | 15 ++--
 .../libgomp.oacc-c-c++-common/loop-red-w-1.c       | 15 ++--
 .../libgomp.oacc-c-c++-common/loop-red-w-2.c       | 15 ++--
 .../libgomp.oacc-c-c++-common/loop-red-wv-1.c      | 11 +--
 .../testsuite/libgomp.oacc-c-c++-common/loop-v-1.c | 15 ++--
 .../testsuite/libgomp.oacc-c-c++-common/loop-w-1.c | 15 ++--
 .../libgomp.oacc-c-c++-common/loop-wv-1.c          | 15 ++--
 .../libgomp.oacc-c-c++-common/parallel-dims.c      | 18 +----
 .../libgomp.oacc-c-c++-common/routine-g-1.c        | 17 ++--
 .../libgomp.oacc-c-c++-common/routine-gwv-1.c      | 17 ++--
 .../libgomp.oacc-c-c++-common/routine-v-1.c        | 17 ++--
 .../libgomp.oacc-c-c++-common/routine-w-1.c        | 17 ++--
 .../libgomp.oacc-c-c++-common/routine-wv-1.c       | 17 ++--
 .../libgomp.oacc-c-c++-common/routine-wv-2.c       | 18 ++---
 .../testsuite/libgomp.oacc-c-c++-common/tile-1.c   | 14 ++--
 30 files changed, 263 insertions(+), 239 deletions(-)

diff --git a/gcc/builtins.def b/gcc/builtins.def
index 671097e..263dfc7ea 100644
--- a/gcc/builtins.def
+++ b/gcc/builtins.def
@@ -214,6 +214,10 @@ along with GCC; see the file COPYING3.  If not see
 #define DEF_GOACC_BUILTIN_COMPILER(ENUM, NAME, TYPE, ATTRS) \
   DEF_BUILTIN (ENUM, "__builtin_" NAME, BUILT_IN_NORMAL, TYPE, TYPE,    \
 	       flag_openacc, true, true, ATTRS, false, true)
+#undef DEF_GOACC_BUILTIN_ONLY
+#define DEF_GOACC_BUILTIN_ONLY(ENUM, NAME, TYPE, ATTRS) \
+  DEF_BUILTIN (ENUM, "__builtin_" NAME, BUILT_IN_NORMAL, TYPE, BT_LAST,    \
+	       false, false, true, ATTRS, false, flag_openacc)
 #undef DEF_GOMP_BUILTIN
 #define DEF_GOMP_BUILTIN(ENUM, NAME, TYPE, ATTRS) \
   DEF_BUILTIN (ENUM, "__builtin_" NAME, BUILT_IN_NORMAL, TYPE, TYPE,    \
diff --git a/gcc/fortran/f95-lang.c b/gcc/fortran/f95-lang.c
index dc9a1ae..60a28b8 100644
--- a/gcc/fortran/f95-lang.c
+++ b/gcc/fortran/f95-lang.c
@@ -1202,6 +1202,10 @@ gfc_init_builtin_functions (void)
 #undef DEF_GOACC_BUILTIN_COMPILER
 #define DEF_GOACC_BUILTIN_COMPILER(code, name, type, attr) \
       gfc_define_builtin (name, builtin_types[type], code, name, attr);
+#undef DEF_GOACC_BUILTIN_ONLY
+#define DEF_GOACC_BUILTIN_ONLY(code, name, type, attr) \
+      gfc_define_builtin ("__builtin_" name, builtin_types[type], code, NULL, \
+			  attr);
 #undef DEF_GOMP_BUILTIN
 #define DEF_GOMP_BUILTIN(code, name, type, attr) /* ignore */
 #include "../omp-builtins.def"
diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
index 21e7b10..abb6780 100644
--- a/gcc/internal-fn.c
+++ b/gcc/internal-fn.c
@@ -2561,7 +2561,10 @@ expand_GOACC_DIM_SIZE (internal_fn, gcall *stmt)
     {
       rtx dim = expand_expr (gimple_call_arg (stmt, 0), NULL_RTX,
 			     VOIDmode, EXPAND_NORMAL);
-      emit_insn (targetm.gen_oacc_dim_size (target, dim));
+      rtx reg = MEM_P (target) ? gen_reg_rtx (GET_MODE (target)) : target;
+      emit_insn (targetm.gen_oacc_dim_size (reg, dim));
+      if (reg != target)
+	emit_move_insn (target, reg);
     }
   else
     emit_move_insn (target, GEN_INT (1));
@@ -2582,7 +2585,10 @@ expand_GOACC_DIM_POS (internal_fn, gcall *stmt)
     {
       rtx dim = expand_expr (gimple_call_arg (stmt, 0), NULL_RTX,
 			     VOIDmode, EXPAND_NORMAL);
-      emit_insn (targetm.gen_oacc_dim_pos (target, dim));
+      rtx reg = MEM_P (target) ? gen_reg_rtx (GET_MODE (target)) : target;
+      emit_insn (targetm.gen_oacc_dim_pos (reg, dim));
+      if (reg != target)
+	emit_move_insn (target, reg);
     }
   else
     emit_move_insn (target, const0_rtx);
diff --git a/gcc/omp-builtins.def b/gcc/omp-builtins.def
index 69b73f4..6675f82 100644
--- a/gcc/omp-builtins.def
+++ b/gcc/omp-builtins.def
@@ -51,6 +51,19 @@ DEF_GOACC_BUILTIN (BUILT_IN_GOACC_WAIT, "GOACC_wait",
 DEF_GOACC_BUILTIN_COMPILER (BUILT_IN_ACC_ON_DEVICE, "acc_on_device",
 			    BT_FN_INT_INT, ATTR_CONST_NOTHROW_LEAF_LIST)
 
+DEF_GOACC_BUILTIN_ONLY (BUILT_IN_GOACC_GANG_ID, "goacc_gang_id",
+			BT_FN_INT, ATTR_NOTHROW_LEAF_LIST)
+DEF_GOACC_BUILTIN_ONLY (BUILT_IN_GOACC_GANG_SIZE, "goacc_gang_size",
+			BT_FN_INT, ATTR_NOTHROW_LEAF_LIST)
+DEF_GOACC_BUILTIN_ONLY (BUILT_IN_GOACC_WORKER_ID, "goacc_worker_id",
+			BT_FN_INT, ATTR_NOTHROW_LEAF_LIST)
+DEF_GOACC_BUILTIN_ONLY (BUILT_IN_GOACC_WORKER_SIZE, "goacc_worker_size",
+			BT_FN_INT, ATTR_NOTHROW_LEAF_LIST)
+DEF_GOACC_BUILTIN_ONLY (BUILT_IN_GOACC_VECTOR_ID, "goacc_vector_id",
+			BT_FN_INT, ATTR_NOTHROW_LEAF_LIST)
+DEF_GOACC_BUILTIN_ONLY (BUILT_IN_GOACC_VECTOR_SIZE, "goacc_vector_size",
+			BT_FN_INT, ATTR_NOTHROW_LEAF_LIST)
+
 DEF_GOMP_BUILTIN (BUILT_IN_OMP_GET_THREAD_NUM, "omp_get_thread_num",
 		  BT_FN_INT, ATTR_CONST_NOTHROW_LEAF_LIST)
 DEF_GOMP_BUILTIN (BUILT_IN_OMP_GET_NUM_THREADS, "omp_get_num_threads",
diff --git a/gcc/omp-general.c b/gcc/omp-general.c
index 0f53661..37262ed 100644
--- a/gcc/omp-general.c
+++ b/gcc/omp-general.c
@@ -615,6 +615,9 @@ oacc_get_fn_attrib (tree fn)
 int
 oacc_get_fn_dim_size (tree fn, int axis)
 {
+  if (lookup_attribute ("omp declare target", DECL_ATTRIBUTES (fn)))
+    return 0;
+
   tree attrs = oacc_get_fn_attrib (fn);
 
   gcc_assert (axis < GOMP_DIM_MAX);
diff --git a/gcc/omp-offload.c b/gcc/omp-offload.c
index 9d5b8be..8ef3c89 100644
--- a/gcc/omp-offload.c
+++ b/gcc/omp-offload.c
@@ -226,20 +226,28 @@ omp_finish_file (void)
 }
 
 /* Call dim_pos (POS == true) or dim_size (POS == false) builtins for
-   axis DIM.  Return a tmp var holding the result.  */
+   axis DIM.  Set the call result to lhs.  */
 
-static tree
-oacc_dim_call (bool pos, int dim, gimple_seq *seq)
+static gimple *
+oacc_dim_call_1 (bool pos, int dim, tree lhs)
 {
   tree arg = build_int_cst (unsigned_type_node, dim);
-  tree size = create_tmp_var (integer_type_node);
   enum internal_fn fn = pos ? IFN_GOACC_DIM_POS : IFN_GOACC_DIM_SIZE;
   gimple *call = gimple_build_call_internal (fn, 1, arg);
+  gimple_call_set_lhs (call, lhs);
+  return call;
+}
 
-  gimple_call_set_lhs (call, size);
-  gimple_seq_add_stmt (seq, call);
+/* Call dim_pos (POS == true) or dim_size (POS == false) builtins for
+   axis DIM.  Return a tmp var holding the result.  */
 
-  return size;
+static tree
+oacc_dim_call (bool pos, int dim, gimple_seq *seq)
+{
+  tree lhs = create_tmp_var (integer_type_node);
+  gimple *call = oacc_dim_call_1 (pos, dim, lhs);
+  gimple_seq_add_stmt (seq, call);
+  return lhs;
 }
 
 /* Find the number of threads (POS = false), or thread number (POS =
@@ -1451,6 +1459,52 @@ default_goacc_reduction (gcall *call)
   gsi_replace_with_seq (&gsi, seq, true);
 }
 
+/* Fold __builtin_goacc_{gang,worker,vector}_{id,size}.  */
+
+static gimple *
+fold_goacc_dim_id_size (gcall *call)
+{
+  tree fndecl = gimple_call_fndecl (call);
+
+  int dim;
+  switch (DECL_FUNCTION_CODE (fndecl))
+    {
+    case BUILT_IN_GOACC_GANG_ID:
+    case BUILT_IN_GOACC_GANG_SIZE:
+      dim = GOMP_DIM_GANG;
+      break;
+    case BUILT_IN_GOACC_WORKER_ID:
+    case BUILT_IN_GOACC_WORKER_SIZE:
+      dim = GOMP_DIM_WORKER;
+      break;
+    case BUILT_IN_GOACC_VECTOR_ID:
+    case BUILT_IN_GOACC_VECTOR_SIZE:
+      dim = GOMP_DIM_VECTOR;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  bool pos;
+  switch (DECL_FUNCTION_CODE (fndecl))
+    {
+    case BUILT_IN_GOACC_GANG_ID:
+    case BUILT_IN_GOACC_WORKER_ID:
+    case BUILT_IN_GOACC_VECTOR_ID:
+      pos = true;
+      break;
+    case BUILT_IN_GOACC_GANG_SIZE:
+    case BUILT_IN_GOACC_WORKER_SIZE:
+    case BUILT_IN_GOACC_VECTOR_SIZE:
+      pos = false;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  return oacc_dim_call_1 (pos, dim, gimple_call_lhs (call));
+}
+
 /* Main entry point for oacc transformations which run on the device
    compiler after LTO, so we know what the target device is at this
    point (including the host fallback).  */
@@ -1552,6 +1606,28 @@ execute_oacc_device_lower ()
 	  }
 
 	gcall *call = as_a <gcall *> (stmt);
+	tree fndecl;
+	if (gimple_call_builtin_p (call)
+	    && (fndecl = gimple_call_fndecl (stmt)) != NULL_TREE
+	    && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
+	  switch (DECL_FUNCTION_CODE (fndecl))
+	    {
+	    case BUILT_IN_GOACC_GANG_ID:
+	    case BUILT_IN_GOACC_WORKER_ID:
+	    case BUILT_IN_GOACC_VECTOR_ID:
+	    case BUILT_IN_GOACC_GANG_SIZE:
+	    case BUILT_IN_GOACC_WORKER_SIZE:
+	    case BUILT_IN_GOACC_VECTOR_SIZE:
+	      {
+		gimple *repl = fold_goacc_dim_id_size (call);
+		gsi_replace (&gsi, repl, false);
+		gsi_next (&gsi);
+		continue;
+	      }
+	    default:
+	      break;
+	    }
+
 	if (!gimple_call_internal_p (call))
 	  {
 	    gsi_next (&gsi);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/gang-static-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/gang-static-2.c
index 6de739a..99a306a 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/gang-static-2.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/gang-static-2.c
@@ -1,25 +1,22 @@
-/* { dg-do run { target openacc_nvidia_accel_selected } } */
-/* This code uses nvptx inline assembly guarded with acc_on_device, which is
-   not optimized away at -O0, and then confuses the target assembler.
-   { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
-
 #include <assert.h>
 #include <openacc.h>
 
 #define N 100
 
 #define GANG_ID(I)						\
-  (acc_on_device (acc_device_nvidia)				\
-   ? ({unsigned __r;						\
-       __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (__r));	\
-       __r; }) : (I))
+  (acc_on_device (acc_device_not_host)				\
+   ? __builtin_goacc_gang_id ()					\
+   : (I))
 
 void
 test_static(int *a, int num_gangs, int sarg)
 {
   int i, j;
 
-  if (sarg == 0)
+  if (acc_on_device (acc_device_host))
+    return;
+
+   if (sarg == 0)
     sarg = 1;
 
   for (i = 0; i < N / sarg; i++)
@@ -32,6 +29,9 @@ test_nonstatic(int *a, int gangs)
 {
   int i, j;
 
+  if (acc_on_device (acc_device_host))
+    return;
+
   for (i = 0; i < N; i+=gangs)
     for (j = 0; j < gangs; j++)
       assert (a[i+j] == i/gangs);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c
index 863b6b3..8920ec2 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c
@@ -1,7 +1,3 @@
-/* This code uses nvptx inline assembly guarded with acc_on_device, which is
-   not optimized away at -O0, and then confuses the target assembler.
-   { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
-
 /* { dg-additional-options "-fopenacc-dim=32" } */
 
 #include <stdio.h>
@@ -79,15 +75,12 @@ static int __attribute__((noinline)) place ()
 {
   int r = 0;
 
-  if (acc_on_device (acc_device_nvidia))
-    {
-      int g = 0, w = 0, v = 0;
+  int g = 0, w = 0, v = 0;
+  g = __builtin_goacc_gang_id ();
+  w = __builtin_goacc_worker_id ();
+  v = __builtin_goacc_vector_id ();
+  r = (g << 16) | (w << 8) | v;
 
-      __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
-      __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
-      __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
-      r = (g << 16) | (w << 8) | v;
-    }
   return r;
 }
 
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c
index e2b08c3..565e1e4 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c
@@ -1,6 +1,3 @@
-/* This code uses nvptx inline assembly guarded with acc_on_device, which is
-   not optimized away at -O0, and then confuses the target assembler.  */
-/* { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
 /* { dg-additional-options "-fopenacc-dim=16:16" } */
 
 #include <openacc.h>
@@ -13,13 +10,13 @@ static int __attribute__ ((noinline)) coord ()
 {
   int res = 0;
 
-  if (acc_on_device (acc_device_nvidia))
+  if (acc_on_device (acc_device_not_host))
     {
-      int g = 0, w = 0, v = 0;
+      int g, w, v;
 
-      __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
-      __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
-      __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
+      g = __builtin_goacc_gang_id ();
+      w = __builtin_goacc_worker_id ();
+      v = __builtin_goacc_vector_id ();
       res = (1 << 24) | (g << 16) | (w << 8) | v;
     }
   return res;
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-g-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-g-1.c
index ae1d588..1ceee00 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-g-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-g-1.c
@@ -1,8 +1,5 @@
-/* This code uses nvptx inline assembly guarded with acc_on_device, which is
-   not optimized away at -O0, and then confuses the target assembler.
-   { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
-
 #include <stdio.h>
+#include <openacc.h>
 
 #define N (32*32*32+17)
 int main ()
@@ -20,13 +17,12 @@ int main ()
 #pragma acc loop gang
     for (unsigned ix = 0; ix < N; ix++)
       {
-	if (__builtin_acc_on_device (5))
+	if (acc_on_device (acc_device_not_host))
 	  {
-	    int g = 0, w = 0, v = 0;
-
-	    __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
-	    __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
-	    __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
+	    int g, w, v;
+	    g = __builtin_goacc_gang_id ();
+	    w = __builtin_goacc_worker_id ();
+	    v = __builtin_goacc_vector_id ();
 	    ary[ix] = (g << 16) | (w << 8) | v;
 	    ondev = 1;
 	  }
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-g-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-g-2.c
index c06d861..f23c88a 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-g-2.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-g-2.c
@@ -1,8 +1,5 @@
-/* This code uses nvptx inline assembly guarded with acc_on_device, which is
-   not optimized away at -O0, and then confuses the target assembler.
-   { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
-
 #include <stdio.h>
+#include <openacc.h>
 
 #define N (32*32*32+17)
 int main ()
@@ -20,13 +17,13 @@ int main ()
 #pragma acc loop gang (static:1)
     for (unsigned ix = 0; ix < N; ix++)
       {
-	if (__builtin_acc_on_device (5))
+	if (acc_on_device (acc_device_not_host))
 	  {
-	    int g = 0, w = 0, v = 0;
+	    int g, w, v;
 
-	    __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
-	    __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
-	    __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
+	    g = __builtin_goacc_gang_id ();
+	    w = __builtin_goacc_worker_id ();
+	    v = __builtin_goacc_vector_id ();
 	    ary[ix] = (g << 16) | (w << 8) | v;
 	    ondev = 1;
 	  }
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c
index 42b612a..b026c8f 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c
@@ -1,8 +1,5 @@
-/* This code uses nvptx inline assembly guarded with acc_on_device, which is
-   not optimized away at -O0, and then confuses the target assembler.
-   { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
-
 #include <stdio.h>
+#include <openacc.h>
 
 #define N (32*32*32+17)
 int main ()
@@ -20,13 +17,14 @@ int main ()
 #pragma acc loop gang worker vector
     for (unsigned ix = 0; ix < N; ix++)
       {
-	if (__builtin_acc_on_device (5))
+	if (acc_on_device (acc_device_not_host))
 	  {
-	    int g = 0, w = 0, v = 0;
+	    int g, w, v;
+
+	    g = __builtin_goacc_gang_id ();
+	    w = __builtin_goacc_worker_id ();
+	    v = __builtin_goacc_vector_id ();
 
-	    __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
-	    __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
-	    __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
 	    ary[ix] = (g << 16) | (w << 8) | v;
 	    ondev = 1;
 	  }
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-g-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-g-1.c
index 929e01c..70a0376 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-g-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-g-1.c
@@ -1,8 +1,5 @@
-/* This code uses nvptx inline assembly guarded with acc_on_device, which is
-   not optimized away at -O0, and then confuses the target assembler.
-   { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
-
 #include <stdio.h>
+#include <openacc.h>
 
 #define N (32*32*32+17)
 int main ()
@@ -18,13 +15,13 @@ int main ()
       {
 	int val = ix;
 	
-	if (__builtin_acc_on_device (5))
+	if (acc_on_device (acc_device_not_host))
 	  {
-	    int g = 0, w = 0, v = 0;
+	    int g, w, v;
 
-	    __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
-	    __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
-	    __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
+	    g = __builtin_goacc_gang_id ();
+	    w = __builtin_goacc_worker_id ();
+	    v = __builtin_goacc_vector_id ();
 	    val = (g << 16) | (w << 8) | v;
 	    ondev = 1;
 	  }
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c
index 4ae4b7c..3535a2a 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c
@@ -1,8 +1,5 @@
-/* This code uses nvptx inline assembly guarded with acc_on_device, which is
-   not optimized away at -O0, and then confuses the target assembler.
-   { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
-
 #include <stdio.h>
+#include <openacc.h>
 
 #define N (32*32*32+17)
 int main ()
@@ -18,13 +15,13 @@ int main ()
       {
 	int val = ix;
 	
-	if (__builtin_acc_on_device (5))
+	if (acc_on_device (acc_device_not_host))
 	  {
-	    int g = 0, w = 0, v = 0;
+	    int g, w, v;
 
-	    __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
-	    __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
-	    __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
+	    g = __builtin_goacc_gang_id ();
+	    w = __builtin_goacc_worker_id ();
+	    v = __builtin_goacc_vector_id ();
 	    val = (g << 16) | (w << 8) | v;
 	    ondev = 1;
 	  }
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-v-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-v-1.c
index 0556455..804a8ce 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-v-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-v-1.c
@@ -1,8 +1,5 @@
-/* This code uses nvptx inline assembly guarded with acc_on_device, which is
-   not optimized away at -O0, and then confuses the target assembler.
-   { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
-
 #include <stdio.h>
+#include <openacc.h>
 
 #define N (32*32*32+17)
 
@@ -19,13 +16,13 @@ int main ()
       {
 	int val = ix;
 	
-	if (__builtin_acc_on_device (5))
+	if (acc_on_device (acc_device_not_host))
 	  {
-	    int g = 0, w = 0, v = 0;
+	    int g, w, v;
 
-	    __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
-	    __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
-	    __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
+	    g = __builtin_goacc_gang_id ();
+	    w = __builtin_goacc_worker_id ();
+	    v = __builtin_goacc_vector_id ();
 	    val = (g << 16) | (w << 8) | v;
 	    ondev = 1;
 	  }
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-v-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-v-2.c
index 16d8f9f..56d8c8c 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-v-2.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-v-2.c
@@ -1,8 +1,5 @@
-/* This code uses nvptx inline assembly guarded with acc_on_device, which is
-   not optimized away at -O0, and then confuses the target assembler.
-   { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
-
 #include <stdio.h>
+#include <openacc.h>
 
 #define N (32*32*32+17)
 
@@ -21,13 +18,13 @@ int main ()
       {
 	int val = ix;
 	
-	if (__builtin_acc_on_device (5))
+	if (acc_on_device (acc_device_not_host))
 	  {
-	    int g = 0, w = 0, v = 0;
+	    int g, w, v;
 
-	    __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
-	    __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
-	    __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
+	    g = __builtin_goacc_gang_id ();
+	    w = __builtin_goacc_worker_id ();
+	    v = __builtin_goacc_vector_id ();
 	    val = (g << 16) | (w << 8) | v;
 	    ondev = 1;
 	  }
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-w-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-w-1.c
index 19021d9..bbf4110 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-w-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-w-1.c
@@ -1,8 +1,5 @@
-/* This code uses nvptx inline assembly guarded with acc_on_device, which is
-   not optimized away at -O0, and then confuses the target assembler.
-   { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
-
 #include <stdio.h>
+#include <openacc.h>
 
 #define N (32*32*32+17)
 int main ()
@@ -18,13 +15,13 @@ int main ()
       {
 	int val = ix;
 	
-	if (__builtin_acc_on_device (5))
+	if (acc_on_device (acc_device_not_host))
 	  {
-	    int g = 0, w = 0, v = 0;
+	    int g, w, v;
 
-	    __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
-	    __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
-	    __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
+	    g = __builtin_goacc_gang_id ();
+	    w = __builtin_goacc_worker_id ();
+	    v = __builtin_goacc_vector_id ();
 	    val = (g << 16) | (w << 8) | v;
 	    ondev = 1;
 	  }
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-w-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-w-2.c
index f0c9d81..d06485b 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-w-2.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-w-2.c
@@ -1,8 +1,5 @@
-/* This code uses nvptx inline assembly guarded with acc_on_device, which is
-   not optimized away at -O0, and then confuses the target assembler.
-   { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
-
 #include <stdio.h>
+#include <openacc.h>
 
 #define N (32*32*32+17)
 int main ()
@@ -20,13 +17,13 @@ int main ()
       {
 	int val = ix;
 	
-	if (__builtin_acc_on_device (5))
+	if (acc_on_device (acc_device_not_host))
 	  {
-	    int g = 0, w = 0, v = 0;
+	    int g, w, v;
 
-	    __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
-	    __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
-	    __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
+	    g = __builtin_goacc_gang_id ();
+	    w = __builtin_goacc_worker_id ();
+	    v = __builtin_goacc_vector_id ();
 	    val = (g << 16) | (w << 8) | v;
 	    ondev = 1;
 	  }
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c
index 0fec2dc..1d9fc2d 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c
@@ -2,6 +2,7 @@
 /* { dg-additional-options "-O2" } */
 
 #include <stdio.h>
+#include <openacc.h>
 
 #define N (32*32*32+17)
 int main ()
@@ -17,13 +18,13 @@ int main ()
       {
 	int val = ix;
 	
-	if (__builtin_acc_on_device (5))
+	if (acc_on_device (acc_device_not_host))
 	  {
-	    int g = 0, w = 0, v = 0;
+	    int g, w, v;
 
-	    __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
-	    __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
-	    __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
+	    g = __builtin_goacc_gang_id ();
+	    w = __builtin_goacc_worker_id ();
+	    v = __builtin_goacc_vector_id ();
 	    val = (g << 16) | (w << 8) | v;
 	    ondev = 1;
 	  }
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-v-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-v-1.c
index 2974807..d9262be 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-v-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-v-1.c
@@ -1,8 +1,5 @@
-/* This code uses nvptx inline assembly guarded with acc_on_device, which is
-   not optimized away at -O0, and then confuses the target assembler.
-   { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
-
 #include <stdio.h>
+#include <openacc.h>
 
 #define N (32*32*32+17)
 int main ()
@@ -20,13 +17,13 @@ int main ()
 #pragma acc loop vector
     for (unsigned ix = 0; ix < N; ix++)
       {
-	if (__builtin_acc_on_device (5))
+	if (acc_on_device (acc_device_not_host))
 	  {
-	    int g = 0, w = 0, v = 0;
+	    int g, w, v;
 
-	    __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
-	    __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
-	    __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
+	    g = __builtin_goacc_gang_id ();
+	    w = __builtin_goacc_worker_id ();
+	    v = __builtin_goacc_vector_id ();
 	    ary[ix] = (g << 16) | (w << 8) | v;
 	    ondev = 1;
 	  }
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-w-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-w-1.c
index 33b6eae..07b77eb 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-w-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-w-1.c
@@ -1,8 +1,5 @@
-/* This code uses nvptx inline assembly guarded with acc_on_device, which is
-   not optimized away at -O0, and then confuses the target assembler.
-   { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
-
 #include <stdio.h>
+#include <openacc.h>
 
 #define N (32*32*32+17)
 int main ()
@@ -20,13 +17,13 @@ int main ()
 #pragma acc loop worker
     for (unsigned ix = 0; ix < N; ix++)
       {
-	if (__builtin_acc_on_device (5))
+	if (acc_on_device (acc_device_not_host))
 	  {
-	    int g = 0, w = 0, v = 0;
+	    int g, w, v;
 
-	    __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
-	    __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
-	    __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
+	    g = __builtin_goacc_gang_id ();
+	    w = __builtin_goacc_worker_id ();
+	    v = __builtin_goacc_vector_id ();
 	    ary[ix] = (g << 16) | (w << 8) | v;
 	    ondev = 1;
 	  }
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c
index 578cfad..d4e048d 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c
@@ -1,8 +1,5 @@
-/* This code uses nvptx inline assembly guarded with acc_on_device, which is
-   not optimized away at -O0, and then confuses the target assembler.
-   { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
-
 #include <stdio.h>
+#include <openacc.h>
 
 #define N (32*32*32+17)
 int main ()
@@ -20,13 +17,13 @@ int main ()
 #pragma acc loop worker vector
     for (unsigned ix = 0; ix < N; ix++)
       {
-	if (__builtin_acc_on_device (5))
+	if (acc_on_device (acc_device_not_host))
 	  {
-	    int g = 0, w = 0, v = 0;
+	    int g, w, v;
 
-	    __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
-	    __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
-	    __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
+	    g = __builtin_goacc_gang_id ();
+	    w = __builtin_goacc_worker_id ();
+	    v = __builtin_goacc_vector_id ();
 	    ary[ix] = (g << 16) | (w << 8) | v;
 	    ondev = 1;
 	  }
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
index 8308f7c..0b19fde 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
@@ -12,11 +12,7 @@ static unsigned int __attribute__ ((optimize ("O2"))) acc_gang ()
   if (acc_on_device ((int) acc_device_host))
     return 0;
   else if (acc_on_device ((int) acc_device_nvidia))
-    {
-      unsigned int r;
-      asm volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (r));
-      return r;
-    }
+    return __builtin_goacc_gang_id ();
   else
     __builtin_abort ();
 }
@@ -27,11 +23,7 @@ static unsigned int __attribute__ ((optimize ("O2"))) acc_worker ()
   if (acc_on_device ((int) acc_device_host))
     return 0;
   else if (acc_on_device ((int) acc_device_nvidia))
-    {
-      unsigned int r;
-      asm volatile ("mov.u32 %0,%%tid.y;" : "=r" (r));
-      return r;
-    }
+    return __builtin_goacc_worker_id ();
   else
     __builtin_abort ();
 }
@@ -42,11 +34,7 @@ static unsigned int __attribute__ ((optimize ("O2"))) acc_vector ()
   if (acc_on_device ((int) acc_device_host))
     return 0;
   else if (acc_on_device ((int) acc_device_nvidia))
-    {
-      unsigned int r;
-      asm volatile ("mov.u32 %0,%%tid.x;" : "=r" (r));
-      return r;
-    }
+    return __builtin_goacc_vector_id ();
   else
     __builtin_abort ();
 }
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-g-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-g-1.c
index b6ab713..7055cd3 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-g-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-g-1.c
@@ -1,8 +1,5 @@
-/* This code uses nvptx inline assembly guarded with acc_on_device, which is
-   not optimized away at -O0, and then confuses the target assembler.
-   { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
-
 #include <stdio.h>
+#include <openacc.h>
 
 #define N (32*32*32+17)
 
@@ -12,13 +9,13 @@ void __attribute__ ((noinline)) gang (int ary[N])
 #pragma acc loop gang
     for (unsigned ix = 0; ix < N; ix++)
       {
-	if (__builtin_acc_on_device (5))
+	if (acc_on_device (acc_device_not_host))
 	  {
-	    int g = 0, w = 0, v = 0;
+	    int g, w, v;
 
-	    __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
-	    __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
-	    __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
+	    g = __builtin_goacc_gang_id ();
+	    w = __builtin_goacc_worker_id ();
+	    v = __builtin_goacc_vector_id ();
 	    ary[ix] = (g << 16) | (w << 8) | v;
 	  }
 	else
@@ -38,7 +35,7 @@ int main ()
   
 #pragma acc parallel num_gangs(32) copy(ary) copy(ondev)
   {
-    ondev = __builtin_acc_on_device (5);
+    ondev = acc_on_device (acc_device_not_host);
     gang (ary);
   }
 
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c
index ace2f49..b9d36a8 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c
@@ -1,8 +1,5 @@
-/* This code uses nvptx inline assembly guarded with acc_on_device, which is
-   not optimized away at -O0, and then confuses the target assembler.
-   { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
-
 #include <stdio.h>
+#include <openacc.h>
 
 #define N (32*32*32+17)
 
@@ -12,13 +9,13 @@ void __attribute__ ((noinline)) gang (int ary[N])
 #pragma acc loop gang worker vector
   for (unsigned ix = 0; ix < N; ix++)
     {
-      if (__builtin_acc_on_device (5))
+      if (acc_on_device (acc_device_not_host))
 	{
-	  int g = 0, w = 0, v = 0;
+	  int g, w, v;
 
-	  __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
-	  __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
-	  __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
+	  g = __builtin_goacc_gang_id ();
+	  w = __builtin_goacc_worker_id ();
+	  v = __builtin_goacc_vector_id ();
 	  ary[ix] = (g << 16) | (w << 8) | v;
 	}
       else
@@ -38,7 +35,7 @@ int main ()
   
 #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) copy(ary) copy(ondev)
   {
-    ondev = __builtin_acc_on_device (5);
+    ondev = acc_on_device (acc_device_not_host);
     gang (ary);
   }
 
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-v-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-v-1.c
index 2503e8d..8c553d5 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-v-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-v-1.c
@@ -1,8 +1,5 @@
-/* This code uses nvptx inline assembly guarded with acc_on_device, which is
-   not optimized away at -O0, and then confuses the target assembler.
-   { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
-
 #include <stdio.h>
+#include <openacc.h>
 
 #define N (32*32*32+17)
 
@@ -12,13 +9,13 @@ void __attribute__ ((noinline)) vector (int ary[N])
 #pragma acc loop vector
   for (unsigned ix = 0; ix < N; ix++)
     {
-      if (__builtin_acc_on_device (5))
+      if (acc_on_device (acc_device_not_host))
 	{
-	  int g = 0, w = 0, v = 0;
+	  int g, w, v;
 
-	  __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
-	  __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
-	  __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
+	  g = __builtin_goacc_gang_id ();
+	  w = __builtin_goacc_worker_id ();
+	  v = __builtin_goacc_vector_id ();
 	  ary[ix] = (g << 16) | (w << 8) | v;
 	}
       else
@@ -38,7 +35,7 @@ int main ()
   
 #pragma acc parallel vector_length(32) copy(ary) copy(ondev)
   {
-    ondev = __builtin_acc_on_device (5);
+    ondev = acc_on_device (acc_device_not_host);
     vector (ary);
   }
 
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-w-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-w-1.c
index 80cd462..7847336 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-w-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-w-1.c
@@ -1,8 +1,5 @@
-/* This code uses nvptx inline assembly guarded with acc_on_device, which is
-   not optimized away at -O0, and then confuses the target assembler.
-   { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
-
 #include <stdio.h>
+#include <openacc.h>
 
 #define N (32*32*32+17)
 
@@ -12,13 +9,13 @@ void __attribute__ ((noinline)) worker (int ary[N])
 #pragma acc loop worker
   for (unsigned ix = 0; ix < N; ix++)
     {
-      if (__builtin_acc_on_device (5))
+      if (acc_on_device (acc_device_not_host))
 	{
-	  int g = 0, w = 0, v = 0;
+	  int g, w, v;
 
-	  __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
-	  __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
-	  __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
+	  g = __builtin_goacc_gang_id ();
+	  w = __builtin_goacc_worker_id ();
+	  v = __builtin_goacc_vector_id ();
 	  ary[ix] = (g << 16) | (w << 8) | v;
 	}
       else
@@ -38,7 +35,7 @@ int main ()
   
 #pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev)
   {
-    ondev = __builtin_acc_on_device (5);
+    ondev = acc_on_device (acc_device_not_host);
     worker (ary);
   }
 
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c
index 5e45fad..748978d 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c
@@ -1,8 +1,5 @@
-/* This code uses nvptx inline assembly guarded with acc_on_device, which is
-   not optimized away at -O0, and then confuses the target assembler.
-   { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
-
 #include <stdio.h>
+#include <openacc.h>
 
 #define N (32*32*32+17)
 
@@ -12,13 +9,13 @@ void __attribute__ ((noinline)) worker (int ary[N])
 #pragma acc loop worker vector
   for (unsigned ix = 0; ix < N; ix++)
     {
-      if (__builtin_acc_on_device (5))
+      if (acc_on_device (acc_device_not_host))
 	{
-	  int g = 0, w = 0, v = 0;
+	  int g, w, v;
 
-	  __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
-	  __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
-	  __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
+	  g = __builtin_goacc_gang_id ();
+	  w = __builtin_goacc_worker_id ();
+	  v = __builtin_goacc_vector_id ();
 	  ary[ix] = (g << 16) | (w << 8) | v;
 	}
       else
@@ -38,7 +35,7 @@ int main ()
   
 #pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev)
   {
-    ondev = __builtin_acc_on_device (5);
+    ondev = acc_on_device (acc_device_not_host);
     worker (ary);
   }
 
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-2.c
index b5cbc90..8a1f60f 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-2.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-2.c
@@ -1,7 +1,3 @@
-/* This code uses nvptx inline assembly guarded with acc_on_device, which is
-   not optimized away at -O0, and then confuses the target assembler.
-   { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
-
 #include <stdio.h>
 #include <openacc.h>
 
@@ -11,15 +7,13 @@
 #define HEIGHT 32
 
 #define WORK_ID(I,N)						\
-  (acc_on_device (acc_device_nvidia)				\
-   ? ({unsigned __r;						\
-       __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (__r));	\
-       __r; }) : (I % N))
+  (acc_on_device (acc_device_not_host)				\
+   ? __builtin_goacc_worker_id ()				\
+   : (I % N))
 #define VEC_ID(I,N)						\
-  (acc_on_device (acc_device_nvidia)				\
-   ? ({unsigned __r;						\
-       __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (__r));	\
-       __r; }) : (I % N))
+  (acc_on_device (acc_device_not_host)				\
+   ? __builtin_goacc_vector_id ()				\
+   : (I % N))
 
 #pragma acc routine worker
 void __attribute__ ((noinline))
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/tile-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/tile-1.c
index 8dcb956..4669291 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/tile-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/tile-1.c
@@ -1,7 +1,3 @@
-/* This code uses nvptx inline assembly guarded with acc_on_device, which is
-   not optimized away at -O0, and then confuses the target assembler.
-   { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
-
 /* { dg-additional-options "-fopenacc-dim=32" } */
 
 #include <stdio.h>
@@ -79,13 +75,13 @@ static int __attribute__((noinline)) place ()
 {
   int r = 0;
 
-  if (acc_on_device (acc_device_nvidia))
+  if (acc_on_device (acc_device_not_host))
     {
-      int g = 0, w = 0, v = 0;
+      int g, w, v;
 
-      __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
-      __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
-      __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
+      g = __builtin_goacc_gang_id ();
+      w = __builtin_goacc_worker_id ();
+      v = __builtin_goacc_vector_id ();
       r = (g << 16) | (w << 8) | v;
     }
   return r;

Reply via email to