[PATCH] D85223: [CUDA][HIP] Support accessing static device variable in host code for -fgpu-rdc

2021-08-20 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl added inline comments.



Comment at: clang/lib/CodeGen/CodeGenModule.cpp:6265-6268
+void CodeGenModule::printPostfixForExternalizedStaticVar(
+llvm::raw_ostream &OS) const {
+  OS << ".static." << getContext().getCUIDHash();
+}

Hahnfeld wrote:
> I've tried to use this with CUDA, but it errors out because `.` is not 
> allowed in identifiers. Could you check if https://reviews.llvm.org/D108456 
> also works for HIP?
I will try it with our CI and get back to you.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D85223/new/

https://reviews.llvm.org/D85223

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D85223: [CUDA][HIP] Support accessing static device variable in host code for -fgpu-rdc

2021-08-20 Thread Jonas Hahnfeld via Phabricator via cfe-commits
Hahnfeld added inline comments.



Comment at: clang/lib/CodeGen/CodeGenModule.cpp:6265-6268
+void CodeGenModule::printPostfixForExternalizedStaticVar(
+llvm::raw_ostream &OS) const {
+  OS << ".static." << getContext().getCUIDHash();
+}

I've tried to use this with CUDA, but it errors out because `.` is not allowed 
in identifiers. Could you check if https://reviews.llvm.org/D108456 also works 
for HIP?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D85223/new/

https://reviews.llvm.org/D85223

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D85223: [CUDA][HIP] Support accessing static device variable in host code for -fgpu-rdc

2021-02-24 Thread Yaxun Liu via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
yaxunl marked 6 inline comments as done.
Closed by commit rG47acdec1dd5d: [CUDA][HIP] Support accessing static device 
variable in host code for -fgpu-rdc (authored by yaxunl).
Herald added a project: clang.

Changed prior to commit:
  https://reviews.llvm.org/D85223?vs=322021&id=326223#toc

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D85223/new/

https://reviews.llvm.org/D85223

Files:
  clang/include/clang/AST/ASTContext.h
  clang/lib/AST/ASTContext.cpp
  clang/lib/CodeGen/CGCUDANV.cpp
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/lib/CodeGen/CodeGenModule.h
  clang/test/CodeGenCUDA/device-var-linkage.cu
  clang/test/CodeGenCUDA/managed-var.cu
  clang/test/CodeGenCUDA/static-device-var-rdc.cu
  clang/test/SemaCUDA/static-device-var.cu

Index: clang/test/SemaCUDA/static-device-var.cu
===
--- /dev/null
+++ clang/test/SemaCUDA/static-device-var.cu
@@ -0,0 +1,50 @@
+// REQUIRES: x86-registered-target
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \
+// RUN:-emit-llvm -o - %s -fsyntax-only -verify=dev
+
+// RUN: %clang_cc1 -triple x86_64-gnu-linux \
+// RUN:-emit-llvm -o - %s -fsyntax-only -verify=host
+
+// Checks allowed usage of file-scope and function-scope static variables.
+
+// host-no-diagnostics
+
+#include "Inputs/cuda.h"
+
+// Checks static variables are allowed in device functions.
+
+__device__ void f1() {
+  const static int b = 123;
+  static int a;
+}
+
+// Checks static variables are allowed in global functions.
+
+__global__ void k1() {
+  const static int b = 123;
+  static int a;
+}
+
+// Checks static device and constant variables are allowed in device and
+// host functions, and static host variables are not allowed in device
+// functions.
+
+static __device__ int x;
+static __constant__ int y;
+static int z;
+
+__global__ void kernel(int *a) {
+  a[0] = x;
+  a[1] = y;
+  a[2] = z;
+  // dev-error@-1 {{reference to __host__ variable 'z' in __global__ function}}
+}
+
+int* getDeviceSymbol(int *x);
+
+void foo() {
+  getDeviceSymbol(&x);
+  getDeviceSymbol(&y);
+}
Index: clang/test/CodeGenCUDA/static-device-var-rdc.cu
===
--- /dev/null
+++ clang/test/CodeGenCUDA/static-device-var-rdc.cu
@@ -0,0 +1,97 @@
+// REQUIRES: x86-registered-target
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device \
+// RUN:   -fgpu-rdc -emit-llvm -o - -x hip %s | FileCheck \
+// RUN:   -check-prefixes=DEV,INT-DEV %s
+
+// RUN: %clang_cc1 -triple x86_64-gnu-linux \
+// RUN:   -fgpu-rdc -emit-llvm -o - -x hip %s | FileCheck \
+// RUN:   -check-prefixes=HOST,INT-HOST %s
+
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -cuid=abc \
+// RUN:   -fgpu-rdc -emit-llvm -o - -x hip %s > %t.dev
+// RUN: cat %t.dev | FileCheck -check-prefixes=DEV,EXT-DEV %s
+
+// RUN: %clang_cc1 -triple x86_64-gnu-linux -cuid=abc \
+// RUN:   -fgpu-rdc -emit-llvm -o - -x hip %s > %t.host
+// RUN: cat %t.host | FileCheck -check-prefixes=HOST,EXT-HOST %s
+
+// Check host and device compilations use the same postfixes for static
+// variable names.
+
+// RUN: cat %t.dev %t.host | FileCheck -check-prefix=POSTFIX %s
+
+#include "Inputs/cuda.h"
+
+// Test function scope static device variable, which should not be externalized.
+// DEV-DAG: @_ZZ6kernelPiPPKiE1w = internal addrspace(4) constant i32 1
+
+
+// HOST-DAG: @_ZL1x = internal global i32 undef
+// HOST-DAG: @_ZL1y = internal global i32 undef
+
+// Test normal static device variables
+// INT-DEV-DAG: @_ZL1x = dso_local addrspace(1) externally_initialized global i32 0
+// INT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x\00"
+
+// Test externalized static device variables
+// EXT-DEV-DAG: @_ZL1x.static.[[HASH:.*]] = dso_local addrspace(1) externally_initialized global i32 0
+// EXT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x.static.[[HASH:.*]]\00"
+
+// POSTFIX: @_ZL1x.static.[[HASH:.*]] = dso_local addrspace(1) externally_initialized global i32 0
+// POSTFIX: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x.static.[[HASH]]\00"
+
+static __device__ int x;
+
+// Test static device variables not used by host code should not be externalized
+// DEV-DAG: @_ZL2x2 = internal addrspace(1) global i32 0
+
+static __device__ int x2;
+
+// Test normal static device variables
+// INT-DEV-DAG: @_ZL1y = dso_local addrspace(4) externally_initialized global i32 0
+// INT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y\00"
+
+// Test externalized static device variables
+// EXT-DEV-DAG: @_ZL1y.static.[[HASH]] = dso_local addrspace(4) externally_initialized global i32 0
+// EXT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y.static.[[HASH]]\00"
+
+static __constant__ int y;
+
+// Test static host variable, which should not be externalized nor registered.
+/

[PATCH] D85223: [CUDA][HIP] Support accessing static device variable in host code for -fgpu-rdc

2021-02-09 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl marked 6 inline comments as done.
yaxunl added a comment.

In D85223#2551894, @JonChesterfield wrote:

> This works around the limitations of the binary format nvptx and amdgpu are 
> using in the compiler. It's the wrong place in the stack to fix it - we could 
> introduce another symbol table in the binary to capture the 
> per-tu-between-arch scoping.
>
> However, if we later reach consensus on what to do in the elf instead, we can 
> still do that. In particular, embedding an elf for one arch in a named 
> section of an elf for a host arch is crude. This workaround seems acceptable 
> in the meantime.

Yes we should revisit this if there is a better solution.




Comment at: clang/test/CodeGenCUDA/device-var-linkage.cu:40
 // NORDC-DAG: @_ZL3sv1 = dso_local addrspace(1) externally_initialized global 
i32 0
-// RDC-DAG: @_ZL3sv1 = internal addrspace(1) global i32 0
+// RDC-DAG: @_ZL3sv1.static.[[HASH:b04fd23c98500190]] = dso_local addrspace(1) 
externally_initialized global i32 0
 // HOST-DAG: @_ZL3sv1 = internal global i32 undef

tra wrote:
> It should probably be a regex after `HASH:`, not the hash value itself.
will do



Comment at: clang/test/CodeGenCUDA/managed-var.cu:42
+// NORDC-D-DAG: @_ZL2sx = dso_local addrspace(1) externally_initialized global 
i32 addrspace(1)* null
+// RDC-D-DAG: @_ZL2sx.static.[[HASH:b04fd23c98500190]] = dso_local 
addrspace(1) externally_initialized global i32 addrspace(1)* null
 // HOST-DAG: @_ZL2sx.init = internal global i32 1

tra wrote:
> Same here.
will do



Comment at: clang/test/CodeGenCUDA/static-device-var-rdc.cu:34
+// Test externalized static device variables
+// EXT-DEV-DAG: @_ZL1x.static.[[HASH:b04fd23c98500190]] = dso_local 
addrspace(1) externally_initialized global i32 0
+// EXT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = 
{{.*}}c"_ZL1x.static.[[HASH:b04fd23c98500190]]\00"

tra wrote:
> ditto.
will do



Comment at: clang/test/SemaCUDA/static-device-var.cu:10
+
+// expected-no-diagnostics
+

tra wrote:
> A comment explaining what we're testing would be helpful. `no-diagnostics` 
> gives no clues about what it is we're looking for here.
> 
> 
will do



Comment at: clang/test/SemaCUDA/static-device-var.cu:14-22
+__device__ void f1() {
+  const static int b = 123;
+  static int a;
+}
+
+__global__ void k1() {
+  const static int b = 123;

tra wrote:
> So, this verifies that we're allowed to use static local vars in device code. 
> A comment would be useful.
will do



Comment at: clang/test/SemaCUDA/static-device-var.cu:23-37
+
+static __device__ int x;
+static __constant__ int y;
+
+__global__ void kernel(int *a) {
+  a[0] = x;
+  a[1] = y;

tra wrote:
> And this verifies that global static vars can be referenced from both host 
> and device. 
> I'd also add a negative test with `static int host_only;` and would verify 
> that we still don't allow accessing it from the device.
will do


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D85223/new/

https://reviews.llvm.org/D85223

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D85223: [CUDA][HIP] Support accessing static device variable in host code for -fgpu-rdc

2021-02-09 Thread Artem Belevich via Phabricator via cfe-commits
tra added a comment.

What breaks existing abstractions is that we produce N ELF objects from a 
single TU and the meaning of `static` becomes fuzzy. On one hand, we don't want 
that static symbol to be visible across objects on the same target, at the same 
time we do want it to be visible across host/device objects compiled from the 
same TU.  ELF does not have a way to express it. Making such symbols visible 
with a unique suffix seems to be a reasonable tradeoff. We probably have more 
options available for AMDGPU. E.g. as you've suggested, give runtime extra 
clues about referencing these variables across host/device boundary without 
resorting to making them externally visible. However, we don't have that 
flexibility for NVPTX.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D85223/new/

https://reviews.llvm.org/D85223

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D85223: [CUDA][HIP] Support accessing static device variable in host code for -fgpu-rdc

2021-02-09 Thread Jon Chesterfield via Phabricator via cfe-commits
JonChesterfield accepted this revision.
JonChesterfield added a comment.

This works around the limitations of the binary format nvptx and amdgpu are 
using in the compiler. It's the wrong place in the stack to fix it - we could 
introduce another symbol table in the binary to capture the per-tu-between-arch 
scoping.

However, if we later reach consensus on what to do in the elf instead, we can 
still do that. In particular, embedding an elf for one arch in a named section 
of an elf for a host arch is crude. This workaround seems acceptable in the 
meantime.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D85223/new/

https://reviews.llvm.org/D85223

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D85223: [CUDA][HIP] Support accessing static device variable in host code for -fgpu-rdc

2021-02-09 Thread Artem Belevich via Phabricator via cfe-commits
tra accepted this revision.
tra added a comment.
This revision is now accepted and ready to land.

LGTM with new test nits.

@JonChesterfield -- are you OK with the patch?




Comment at: clang/test/CodeGenCUDA/device-var-linkage.cu:40
 // NORDC-DAG: @_ZL3sv1 = dso_local addrspace(1) externally_initialized global 
i32 0
-// RDC-DAG: @_ZL3sv1 = internal addrspace(1) global i32 0
+// RDC-DAG: @_ZL3sv1.static.[[HASH:b04fd23c98500190]] = dso_local addrspace(1) 
externally_initialized global i32 0
 // HOST-DAG: @_ZL3sv1 = internal global i32 undef

It should probably be a regex after `HASH:`, not the hash value itself.



Comment at: clang/test/CodeGenCUDA/managed-var.cu:42
+// NORDC-D-DAG: @_ZL2sx = dso_local addrspace(1) externally_initialized global 
i32 addrspace(1)* null
+// RDC-D-DAG: @_ZL2sx.static.[[HASH:b04fd23c98500190]] = dso_local 
addrspace(1) externally_initialized global i32 addrspace(1)* null
 // HOST-DAG: @_ZL2sx.init = internal global i32 1

Same here.



Comment at: clang/test/CodeGenCUDA/static-device-var-rdc.cu:34
+// Test externalized static device variables
+// EXT-DEV-DAG: @_ZL1x.static.[[HASH:b04fd23c98500190]] = dso_local 
addrspace(1) externally_initialized global i32 0
+// EXT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = 
{{.*}}c"_ZL1x.static.[[HASH:b04fd23c98500190]]\00"

ditto.



Comment at: clang/test/SemaCUDA/static-device-var.cu:10
+
+// expected-no-diagnostics
+

A comment explaining what we're testing would be helpful. `no-diagnostics` 
gives no clues about what it is we're looking for here.





Comment at: clang/test/SemaCUDA/static-device-var.cu:14-22
+__device__ void f1() {
+  const static int b = 123;
+  static int a;
+}
+
+__global__ void k1() {
+  const static int b = 123;

So, this verifies that we're allowed to use static local vars in device code. A 
comment would be useful.



Comment at: clang/test/SemaCUDA/static-device-var.cu:23-37
+
+static __device__ int x;
+static __constant__ int y;
+
+__global__ void kernel(int *a) {
+  a[0] = x;
+  a[1] = y;

And this verifies that global static vars can be referenced from both host and 
device. 
I'd also add a negative test with `static int host_only;` and would verify that 
we still don't allow accessing it from the device.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D85223/new/

https://reviews.llvm.org/D85223

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D85223: [CUDA][HIP] Support accessing static device variable in host code for -fgpu-rdc

2021-02-07 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl updated this revision to Diff 322021.
yaxunl marked 3 inline comments as done.
yaxunl edited the summary of this revision.
yaxunl added a comment.

Revised by Artem's comments. Use CUID hash as postfix for static variable name.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D85223/new/

https://reviews.llvm.org/D85223

Files:
  clang/include/clang/AST/ASTContext.h
  clang/lib/AST/ASTContext.cpp
  clang/lib/CodeGen/CGCUDANV.cpp
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/lib/CodeGen/CodeGenModule.h
  clang/test/CodeGenCUDA/device-var-linkage.cu
  clang/test/CodeGenCUDA/managed-var.cu
  clang/test/CodeGenCUDA/static-device-var-rdc.cu
  clang/test/SemaCUDA/static-device-var.cu

Index: clang/test/SemaCUDA/static-device-var.cu
===
--- /dev/null
+++ clang/test/SemaCUDA/static-device-var.cu
@@ -0,0 +1,37 @@
+// REQUIRES: x86-registered-target
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \
+// RUN:-emit-llvm -o - %s -fsyntax-only -verify
+
+// RUN: %clang_cc1 -triple x86_64-gnu-linux \
+// RUN:-emit-llvm -o - %s -fsyntax-only -verify
+
+// expected-no-diagnostics
+
+#include "Inputs/cuda.h"
+
+__device__ void f1() {
+  const static int b = 123;
+  static int a;
+}
+
+__global__ void k1() {
+  const static int b = 123;
+  static int a;
+}
+
+static __device__ int x;
+static __constant__ int y;
+
+__global__ void kernel(int *a) {
+  a[0] = x;
+  a[1] = y;
+}
+
+int* getDeviceSymbol(int *x);
+
+void foo() {
+  getDeviceSymbol(&x);
+  getDeviceSymbol(&y);
+}
Index: clang/test/CodeGenCUDA/static-device-var-rdc.cu
===
--- /dev/null
+++ clang/test/CodeGenCUDA/static-device-var-rdc.cu
@@ -0,0 +1,89 @@
+// REQUIRES: x86-registered-target
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device \
+// RUN:   -fgpu-rdc -emit-llvm -o - -x hip %s | FileCheck \
+// RUN:   -check-prefixes=DEV,INT-DEV %s
+
+// RUN: %clang_cc1 -triple x86_64-gnu-linux \
+// RUN:   -fgpu-rdc -emit-llvm -o - -x hip %s | FileCheck \
+// RUN:   -check-prefixes=HOST,INT-HOST %s
+
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -cuid=abc \
+// RUN:   -fgpu-rdc -emit-llvm -o - -x hip %s | FileCheck \
+// RUN:   -check-prefixes=DEV,EXT-DEV %s
+
+// RUN: %clang_cc1 -triple x86_64-gnu-linux -cuid=abc \
+// RUN:   -fgpu-rdc -emit-llvm -o - -x hip %s | FileCheck \
+// RUN:   -check-prefixes=HOST,EXT-HOST %s
+
+#include "Inputs/cuda.h"
+
+// Test function scope static device variable, which should not be externalized.
+// DEV-DAG: @_ZZ6kernelPiPPKiE1w = internal addrspace(4) constant i32 1
+
+
+// HOST-DAG: @_ZL1x = internal global i32 undef
+// HOST-DAG: @_ZL1y = internal global i32 undef
+
+// Test normal static device variables
+// INT-DEV-DAG: @_ZL1x = dso_local addrspace(1) externally_initialized global i32 0
+// INT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x\00"
+
+// Test externalized static device variables
+// EXT-DEV-DAG: @_ZL1x.static.[[HASH:b04fd23c98500190]] = dso_local addrspace(1) externally_initialized global i32 0
+// EXT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x.static.[[HASH:b04fd23c98500190]]\00"
+
+static __device__ int x;
+
+// Test static device variables not used by host code should not be externalized
+// DEV-DAG: @_ZL2x2 = internal addrspace(1) global i32 0
+
+static __device__ int x2;
+
+// Test normal static device variables
+// INT-DEV-DAG: @_ZL1y = dso_local addrspace(4) externally_initialized global i32 0
+// INT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y\00"
+
+// Test externalized static device variables
+// EXT-DEV-DAG: @_ZL1y.static.[[HASH]] = dso_local addrspace(4) externally_initialized global i32 0
+// EXT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y.static.[[HASH]]\00"
+
+static __constant__ int y;
+
+// Test static host variable, which should not be externalized nor registered.
+// HOST-DAG: @_ZL1z = internal global i32 0
+// DEV-NOT: @_ZL1z
+static int z;
+
+// Test static device variable in inline function, which should not be
+// externalized nor registered.
+// DEV-DAG: @_ZZ6devfunPPKiE1p = linkonce_odr addrspace(4) constant i32 2, comdat
+
+inline __device__ void devfun(const int ** b) {
+  const static int p = 2;
+  b[0] = &p;
+}
+
+__global__ void kernel(int *a, const int **b) {
+  const static int w = 1;
+  a[0] = x;
+  a[1] = y;
+  b[0] = &w;
+  b[1] = &x2;
+  devfun(b);
+}
+
+int* getDeviceSymbol(int *x);
+
+void foo() {
+  getDeviceSymbol(&x);
+  getDeviceSymbol(&y);
+  z = 123;
+}
+
+// HOST: __hipRegisterVar({{.*}}@_ZL1x {{.*}}@[[DEVNAMEX]]
+// HOST: __hipRegisterVar({{.*}}@_ZL1y {{.*}}@[[DEVNAMEY]]
+// HOST-NOT: __hipRegisterVar({{.*}}@_ZL2x2
+// HOST-NOT: __hipRegisterVar({{.*}}@_ZZ6kernelPiPPKiE1w
+// HOST-NOT: __hipRegisterVar({{.*}}@_ZZ6devfunPPKiE1p
Index: clang/test/CodeGenCUDA/managed-var.cu
==

[PATCH] D85223: [CUDA][HIP] Support accessing static device variable in host code for -fgpu-rdc

2021-02-07 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl marked 3 inline comments as done.
yaxunl added inline comments.



Comment at: clang/lib/AST/ASTContext.cpp:11446-11447
 bool ASTContext::mayExternalizeStaticVar(const Decl *D) const {
-  return !getLangOpts().GPURelocatableDeviceCode &&
+  return (!getLangOpts().CUID.empty() ||
+  !getLangOpts().GPURelocatableDeviceCode) &&
  ((D->hasAttr() &&

tra wrote:
> tra wrote:
> > `!(getLangOpts().GPURelocatableDeviceCode && getLangOpts().CUID.empty())`.
> > 
> > Maybe this should be broken down into something easier to read.
> > ```
> >   // Applies only to -fgpu-rdc or when we were given a CUID
> >   if (!getLangOpts().GPURelocatableDeviceCode || 
> > !getLangOpts().CUID.empty()))
> >   return false;
> >   // .. only file-scope static vars...
> >   auto *VD = dyn_cast(D);
> >   if (!(VD && VD->isFileVarDecl() && VD->getStorageClass() == SC_Static))
> >   return false;
> >   // .. with explicit __device__ or __constant__ attributes.
> >   return ((D->hasAttr() && 
> > !D->getAttr()->isImplicit()) ||
> >   (D->hasAttr() 
> > &&!D->getAttr()->isImplicit()));
> >   
> > ```
> BTW, does this mean that we'll externalize & uniquify the vars even w/o 
> `-fgpu-rdc` if CUID is given?
> 
> IMO `-fgpu-rdc` should remain the flag to control whether externalization is 
> needed.
> CUID controls the value of a unique suffix, if we need it, but should not 
> automatically enable externalization.
> 
> 
done



Comment at: clang/lib/AST/ASTContext.cpp:11446-11447
 bool ASTContext::mayExternalizeStaticVar(const Decl *D) const {
-  return !getLangOpts().GPURelocatableDeviceCode &&
+  return (!getLangOpts().CUID.empty() ||
+  !getLangOpts().GPURelocatableDeviceCode) &&
  ((D->hasAttr() &&

yaxunl wrote:
> tra wrote:
> > tra wrote:
> > > `!(getLangOpts().GPURelocatableDeviceCode && getLangOpts().CUID.empty())`.
> > > 
> > > Maybe this should be broken down into something easier to read.
> > > ```
> > >   // Applies only to -fgpu-rdc or when we were given a CUID
> > >   if (!getLangOpts().GPURelocatableDeviceCode || 
> > > !getLangOpts().CUID.empty()))
> > >   return false;
> > >   // .. only file-scope static vars...
> > >   auto *VD = dyn_cast(D);
> > >   if (!(VD && VD->isFileVarDecl() && VD->getStorageClass() == SC_Static))
> > >   return false;
> > >   // .. with explicit __device__ or __constant__ attributes.
> > >   return ((D->hasAttr() && 
> > > !D->getAttr()->isImplicit()) ||
> > >   (D->hasAttr() 
> > > &&!D->getAttr()->isImplicit()));
> > >   
> > > ```
> > BTW, does this mean that we'll externalize & uniquify the vars even w/o 
> > `-fgpu-rdc` if CUID is given?
> > 
> > IMO `-fgpu-rdc` should remain the flag to control whether externalization 
> > is needed.
> > CUID controls the value of a unique suffix, if we need it, but should not 
> > automatically enable externalization.
> > 
> > 
> done
mayExternalizeStaticVar returning true does not mean the static var must be 
externalized. It only indicates that the static var may be externalized, and it 
is used to enable checking whether this var is used by host code.

For -fno-gpu-rdc, we only externalize a static variable if it is referenced by 
host code. If a static var is referenced by host code, -fno-gpu-rdc will change 
its linkage to external, but does not need to make the symbol unique because 
each TU ends up as a different device binary.



Comment at: clang/lib/CodeGen/CodeGenModule.cpp:2864-2865
+#if 0
+  // We need to decide whether to externalize a static variable after checking
+  // whether it is referenced in host code.
+  if (isa(Global) && getContext().mayExternalizeStaticVar(

tra wrote:
> Is this code needed?
this code is not needed. removed.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D85223/new/

https://reviews.llvm.org/D85223

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D85223: [CUDA][HIP] Support accessing static device variable in host code for -fgpu-rdc

2021-01-20 Thread Artem Belevich via Phabricator via cfe-commits
tra added inline comments.



Comment at: clang/lib/AST/ASTContext.cpp:11446-11447
 bool ASTContext::mayExternalizeStaticVar(const Decl *D) const {
-  return !getLangOpts().GPURelocatableDeviceCode &&
+  return (!getLangOpts().CUID.empty() ||
+  !getLangOpts().GPURelocatableDeviceCode) &&
  ((D->hasAttr() &&

`!(getLangOpts().GPURelocatableDeviceCode && getLangOpts().CUID.empty())`.

Maybe this should be broken down into something easier to read.
```
  // Applies only to -fgpu-rdc or when we were given a CUID
  if (!getLangOpts().GPURelocatableDeviceCode || !getLangOpts().CUID.empty()))
  return false;
  // .. only file-scope static vars...
  auto *VD = dyn_cast<VarDecl>(D);
  if (!(VD && VD->isFileVarDecl() && VD->getStorageClass() == SC_Static))
  return false;
  // .. with explicit __device__ or __constant__ attributes.
  return ((D->hasAttr<CUDADeviceAttr>() && 
!D->getAttr<CUDADeviceAttr>()->isImplicit()) ||
  (D->hasAttr<CUDAConstantAttr>() 
&&!D->getAttr<CUDAConstantAttr>()->isImplicit()));
  
```



Comment at: clang/lib/AST/ASTContext.cpp:11446-11447
 bool ASTContext::mayExternalizeStaticVar(const Decl *D) const {
-  return !getLangOpts().GPURelocatableDeviceCode &&
+  return (!getLangOpts().CUID.empty() ||
+  !getLangOpts().GPURelocatableDeviceCode) &&
  ((D->hasAttr() &&

tra wrote:
> `!(getLangOpts().GPURelocatableDeviceCode && getLangOpts().CUID.empty())`.
> 
> Maybe this should be broken down into something easier to read.
> ```
>   // Applies only to -fgpu-rdc or when we were given a CUID
>   if (!getLangOpts().GPURelocatableDeviceCode || !getLangOpts().CUID.empty()))
>   return false;
>   // .. only file-scope static vars...
>   auto *VD = dyn_cast(D);
>   if (!(VD && VD->isFileVarDecl() && VD->getStorageClass() == SC_Static))
>   return false;
>   // .. with explicit __device__ or __constant__ attributes.
>   return ((D->hasAttr() && 
> !D->getAttr()->isImplicit()) ||
>   (D->hasAttr() 
> &&!D->getAttr()->isImplicit()));
>   
> ```
BTW, does this mean that we'll externalize & uniquify the vars even w/o 
`-fgpu-rdc` if CUID is given?

IMO `-fgpu-rdc` should remain the flag to control whether externalization is 
needed.
CUID controls the value of a unique suffix, if we need it, but should not 
automatically enable externalization.





Comment at: clang/lib/CodeGen/CodeGenModule.cpp:2864-2865
+#if 0
+  // We need to decide whether to externalize a static variable after checking
+  // whether it is referenced in host code.
+  if (isa(Global) && getContext().mayExternalizeStaticVar(

Is this code needed?


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D85223/new/

https://reviews.llvm.org/D85223

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D85223: [CUDA][HIP] Support accessing static device variable in host code for -fgpu-rdc

2021-01-19 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl updated this revision to Diff 317712.
yaxunl edited the summary of this revision.
yaxunl added a comment.

separate CUID patch.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D85223/new/

https://reviews.llvm.org/D85223

Files:
  clang/lib/AST/ASTContext.cpp
  clang/lib/CodeGen/CGCUDANV.cpp
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/lib/CodeGen/CodeGenModule.h
  clang/test/CodeGenCUDA/static-device-var-rdc.cu
  clang/test/SemaCUDA/static-device-var.cu

Index: clang/test/SemaCUDA/static-device-var.cu
===
--- /dev/null
+++ clang/test/SemaCUDA/static-device-var.cu
@@ -0,0 +1,37 @@
+// REQUIRES: x86-registered-target
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \
+// RUN:-emit-llvm -o - %s -fsyntax-only -verify
+
+// RUN: %clang_cc1 -triple x86_64-gnu-linux \
+// RUN:-emit-llvm -o - %s -fsyntax-only -verify
+
+// expected-no-diagnostics
+
+#include "Inputs/cuda.h"
+
+__device__ void f1() {
+  const static int b = 123;
+  static int a;
+}
+
+__global__ void k1() {
+  const static int b = 123;
+  static int a;
+}
+
+static __device__ int x;
+static __constant__ int y;
+
+__global__ void kernel(int *a) {
+  a[0] = x;
+  a[1] = y;
+}
+
+int* getDeviceSymbol(int *x);
+
+void foo() {
+  getDeviceSymbol(&x);
+  getDeviceSymbol(&y);
+}
Index: clang/test/CodeGenCUDA/static-device-var-rdc.cu
===
--- /dev/null
+++ clang/test/CodeGenCUDA/static-device-var-rdc.cu
@@ -0,0 +1,89 @@
+// REQUIRES: x86-registered-target
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device \
+// RUN:   -fgpu-rdc -emit-llvm -o - -x hip %s | FileCheck \
+// RUN:   -check-prefixes=DEV,INT-DEV %s
+
+// RUN: %clang_cc1 -triple x86_64-gnu-linux \
+// RUN:   -fgpu-rdc -emit-llvm -o - -x hip %s | FileCheck \
+// RUN:   -check-prefixes=HOST,INT-HOST %s
+
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -cuid=123abc \
+// RUN:   -fgpu-rdc -emit-llvm -o - -x hip %s | FileCheck \
+// RUN:   -check-prefixes=DEV,EXT-DEV %s
+
+// RUN: %clang_cc1 -triple x86_64-gnu-linux -cuid=123abc \
+// RUN:   -fgpu-rdc -emit-llvm -o - -x hip %s | FileCheck \
+// RUN:   -check-prefixes=HOST,EXT-HOST %s
+
+#include "Inputs/cuda.h"
+
+// Test function scope static device variable, which should not be externalized.
+// DEV-DAG: @_ZZ6kernelPiPPKiE1w = internal addrspace(4) constant i32 1
+
+
+// HOST-DAG: @_ZL1x = internal global i32 undef
+// HOST-DAG: @_ZL1y = internal global i32 undef
+
+// Test normal static device variables
+// INT-DEV-DAG: @_ZL1x = internal addrspace(1) global i32 0
+// INT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x\00"
+
+// Test externalized static device variables
+// EXT-DEV-DAG: @_ZL1x.static.123abc = {{.*}}addrspace(1) externally_initialized global i32 0
+// EXT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x.static.123abc\00"
+
+static __device__ int x;
+
+// Test static device variables not used by host code should not be externalized
+// DEV-DAG: @_ZL2x2 = internal addrspace(1) global i32 0
+
+static __device__ int x2;
+
+// Test normal static device variables
+// INT-DEV-DAG: @_ZL1y = internal addrspace(4) global i32 0
+// INT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y\00"
+
+// Test externalized static device variables
+// EXT-DEV-DAG: @_ZL1y.static.123abc = {{.*}}addrspace(4) externally_initialized global i32 0
+// EXT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y.static.123abc\00"
+
+static __constant__ int y;
+
+// Test static host variable, which should not be externalized nor registered.
+// HOST-DAG: @_ZL1z = internal global i32 0
+// DEV-NOT: @_ZL1z
+static int z;
+
+// Test static device variable in inline function, which should not be
+// externalized nor registered.
+// DEV-DAG: @_ZZ6devfunPPKiE1p = linkonce_odr addrspace(4) constant i32 2, comdat
+
+inline __device__ void devfun(const int ** b) {
+  const static int p = 2;
+  b[0] = &p;
+}
+
+__global__ void kernel(int *a, const int **b) {
+  const static int w = 1;
+  a[0] = x;
+  a[1] = y;
+  b[0] = &w;
+  b[1] = &x2;
+  devfun(b);
+}
+
+int* getDeviceSymbol(int *x);
+
+void foo() {
+  getDeviceSymbol(&x);
+  getDeviceSymbol(&y);
+  z = 123;
+}
+
+// HOST: __hipRegisterVar({{.*}}@_ZL1x {{.*}}@[[DEVNAMEX]]
+// HOST: __hipRegisterVar({{.*}}@_ZL1y {{.*}}@[[DEVNAMEY]]
+// HOST-NOT: __hipRegisterVar({{.*}}@_ZL2x2
+// HOST-NOT: __hipRegisterVar({{.*}}@_ZZ6kernelPiPPKiE1w
+// HOST-NOT: __hipRegisterVar({{.*}}@_ZZ6devfunPPKiE1p
Index: clang/lib/CodeGen/CodeGenModule.h
===
--- clang/lib/CodeGen/CodeGenModule.h
+++ clang/lib/CodeGen/CodeGenModule.h
@@ -1416,6 +1416,10 @@
TBAAAccessInfo *TBAAInfo = nullptr);
   bool stopAutoInit();
 
+  /// Print the postfix for externalized static variable for single source
+  /// off

[PATCH] D85223: [CUDA][HIP] Support accessing static device variable in host code for -fgpu-rdc

2021-01-19 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl added a comment.

In D85223#2507518 , @tra wrote:

> I'd propose splitting the patch into two. One with the addition of CUID and 
> the other that changes the way we handle static vars.
> CUID is useful on its own and is relatively uncontroversial.
>
> Externalizing static vars is a more interesting issue and I'm not sure what's 
> the best way to handle it yet. On one hand it is necessary for visibility 
> across host/device, on the other, externalizing all static vars will almost 
> always have negative effect as very few of the static vars actually need 
> this. As already pointed out in the `#if 0` section of the patch, ideally we 
> should externalize only the vars that need it. Generally speaking, I do not 
> think we will be able to do that, because with `-fgpu-rdc` it may be used 
> from the host code in some other TU.
>
> We may need to explicitly annotate the static variables that need to be 
> visible on both sides and only apply externalization to the variables 
> annotated this way. E.g. require them to be `__host__ __device__`.
>
> WDYT?

Agree that CUID may be useful for other situations. Will separate it to another 
review.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D85223/new/

https://reviews.llvm.org/D85223

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D85223: [CUDA][HIP] Support accessing static device variable in host code for -fgpu-rdc

2021-01-19 Thread Artem Belevich via Phabricator via cfe-commits
tra added a comment.

I'd propose splitting the patch into two. One with the addition of CUID and the 
other that changes the way we handle static vars.
CUID is useful on its own and is relatively uncontroversial.

Externalizing static vars is a more interesting issue and I'm not sure what's 
the best way to handle it yet. On one hand it is necessary for visibility 
across host/device, on the other, externalizing all static vars will almost 
always have negative effect as very few of the static vars actually need this. 
As already pointed out in the `#if 0` section of the patch, ideally we should 
externalize only the vars that need it. Generally speaking, I do not think we 
will be able to do that, because with `-fgpu-rdc` it may be used from the host 
code in some other TU.

We may need to explicitly annotate the static variables that need to be 
visible on both sides and only apply externalization to the variables annotated 
this way. E.g. require them to be `__host__ __device__`.

WDYT?


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D85223/new/

https://reviews.llvm.org/D85223

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D85223: [CUDA][HIP] Support accessing static device variable in host code for -fgpu-rdc

2021-01-15 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl added a comment.

In D85223#2452363 , @JonChesterfield 
wrote:

> I concede that making the variables external, and trying to give them unique 
> names, does work around static variables not working. I believe static 
> variables are subjected to more aggressive optimisation than external ones 
> but the effect might not be significant.
>
> This "works" in cuda today because the loader ignores the local annotation 
> when accessing the variable. There is some probably unintended behaviour when 
> multiple static variables have the same name in that the first one wins.
>
> The corresponding change to the hsa loader is trivial. Why is making the 
> symbols external, with the associated complexity in picking non-conflicting 
> names, considered better than changing the loader?

Three reasons:

1. The loader would like to look up dynsym only, which conforms better to the 
standard dynamic linker behavior and is more efficient than looking up all 
symbols.

2. There could be symbols with the same name from different compilation units 
and they end up as local symbols with the same name in the binary. How does the 
loader know which is which?

3. If a device symbol is static but actually accessed by the host code in the 
same compilation unit, the device symbol has de facto external linkage since it 
is truly accessed by someone outside of the device object (this is due to the 
unfortunate fact that a single source file ends up with a host object and a 
device object even though they are supposed to be the same compilation unit).  
Keeping the device symbol with internal linkage will cause the compiler to 
over-optimize the device code.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D85223/new/

https://reviews.llvm.org/D85223

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D85223: [CUDA][HIP] Support accessing static device variable in host code for -fgpu-rdc

2020-12-14 Thread Jon Chesterfield via Phabricator via cfe-commits
JonChesterfield added reviewers: jdoerfert, hfinkel.
JonChesterfield added a comment.
Herald added a subscriber: dexonsmith.

I concede that making the variables external, and trying to give them unique 
names, does work around static variables not working. I believe static 
variables are subjected to more aggressive optimisation than external ones but 
the effect might not be significant.

This "works" in cuda today because the loader ignores the local annotation when 
accessing the variable. There is some probably unintended behaviour when 
multiple static variables have the same name in that the first one wins.

The corresponding change to the hsa loader is trivial. Why is making the 
symbols external, with the associated complexity in picking non-conflicting 
names, considered better than changing the loader?


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D85223/new/

https://reviews.llvm.org/D85223

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D85223: [CUDA][HIP] Support accessing static device variable in host code for -fgpu-rdc

2020-08-04 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl created this revision.
yaxunl added reviewers: tra, rjmccall, JonChesterfield, hliao.
Herald added a subscriber: dang.
yaxunl requested review of this revision.

This is separated from https://reviews.llvm.org/D80858

For -fgpu-rdc mode, static device vars in different TU's may have the same name.
To support accessing file-scope static device variables in host code, we need 
to give them
a distinct name and external linkage. This can be done by postfixing each 
static device variable with
a distinct CUID (Compilation Unit ID). Also we have to make sure the host 
compilation and device
compilation of the same compilation unit use identical CUID.

This patch added a distinct CUID for each input file, which is represented by 
InputAction.
clang initially creates an InputAction for each input file for the host 
compilation. In CUDA/HIP action
builder, each InputAction is given a CUID and cloned for each GPU arch, and the 
CUID is also cloned. In this way,
we guarantee the corresponding device and host compilations for the same file 
share the
same CUID, therefore the postfixed device variable and shadow variable share 
the same name.
On the other hand, different compilation units have different CUID, therefore a 
static variable
with the same name but in a different compilation unit will have a different 
name.

Since the static device variables have different names across compilation units, 
now we let
them have external linkage so that they can be looked up by the runtime.

-fuse-cuid=random|hash|none is added to control the method to generate CUID. 
The default
is hash. -cuid=X is also added to specify CUID explicitly, which overrides 
-fuse-cuid.


https://reviews.llvm.org/D85223

Files:
  clang/include/clang/AST/ASTContext.h
  clang/include/clang/Basic/DiagnosticDriverKinds.td
  clang/include/clang/Basic/LangOptions.h
  clang/include/clang/Driver/Action.h
  clang/include/clang/Driver/Compilation.h
  clang/include/clang/Driver/Options.td
  clang/lib/AST/ASTContext.cpp
  clang/lib/CodeGen/CGCUDANV.cpp
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/lib/CodeGen/CodeGenModule.h
  clang/lib/Driver/Action.cpp
  clang/lib/Driver/Driver.cpp
  clang/lib/Driver/ToolChains/Clang.cpp
  clang/lib/Frontend/CompilerInvocation.cpp
  clang/test/CodeGenCUDA/static-device-var-rdc.cu
  clang/test/Driver/hip-cuid.hip
  clang/test/Frontend/hip-cuid.hip
  clang/test/SemaCUDA/static-device-var.cu

Index: clang/test/SemaCUDA/static-device-var.cu
===
--- /dev/null
+++ clang/test/SemaCUDA/static-device-var.cu
@@ -0,0 +1,37 @@
+// REQUIRES: x86-registered-target
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \
+// RUN:-emit-llvm -o - %s -fsyntax-only -verify
+
+// RUN: %clang_cc1 -triple x86_64-gnu-linux \
+// RUN:-emit-llvm -o - %s -fsyntax-only -verify
+
+#include "Inputs/cuda.h"
+
+__device__ void f1() {
+  const static int b = 123;
+  static int a;
+  // expected-error@-1 {{within a __device__ function, only __shared__ variables or const variables without device memory qualifier may be marked 'static'}}
+}
+
+__global__ void k1() {
+  const static int b = 123;
+  static int a;
+  // expected-error@-1 {{within a __global__ function, only __shared__ variables or const variables without device memory qualifier may be marked 'static'}}
+}
+
+static __device__ int x;
+static __constant__ int y;
+
+__global__ void kernel(int *a) {
+  a[0] = x;
+  a[1] = y;
+}
+
+int* getDeviceSymbol(int *x);
+
+void foo() {
+  getDeviceSymbol(&x);
+  getDeviceSymbol(&y);
+}
Index: clang/test/Frontend/hip-cuid.hip
===
--- /dev/null
+++ clang/test/Frontend/hip-cuid.hip
@@ -0,0 +1,6 @@
+// RUN: not %clang_cc1 -cuid=abc-123 -offload-arch=gfx906 %s 2>&1 \
+// RUN:   | FileCheck --check-prefix=INVALID %s
+
+// INVALID: invalid value 'abc-123' in '-cuid=abc-123' (alphanumeric characters and underscore only)
+
+// RUN: %clang_cc1 -cuid=abc_123 -offload-arch=gfx906 %s
Index: clang/test/Driver/hip-cuid.hip
===
--- /dev/null
+++ clang/test/Driver/hip-cuid.hip
@@ -0,0 +1,130 @@
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: amdgpu-registered-target
+
+// Check invalid -fuse-cuid= option.
+
+// RUN: not %clang -### -x hip \
+// RUN:   -target x86_64-unknown-linux-gnu \
+// RUN:   --offload-arch=gfx900 \
+// RUN:   --offload-arch=gfx906 \
+// RUN:   -c -nogpulib -fuse-cuid=invalid \
+// RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
+// RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
+// RUN: 2>&1 | FileCheck -check-prefixes=INVALID %s
+
+// INVALID: invalid value 'invalid' in '-fuse-cuid=invalid'
+
+// Check random CUID generator.
+
+// RUN: %clang -### -x hip \
+// RUN:   -target x86_64-unknown-linux-gnu \
+// RUN:   --offload-arch=gfx900 \
+// RUN:   --offload-arch=gfx906 \
+// RUN:   -c -nogpu