[PATCH] D155850: [Clang][CodeGen][RFC] Add codegen support for C++ Parallel Algorithm Offload

2023-08-22 Thread Alex Voicu via Phabricator via cfe-commits
AlexVlx updated this revision to Diff 552575.
AlexVlx edited the summary of this revision.
AlexVlx added a comment.

Updating to reflect the outcome of the RFC, which is that this will be added as 
a HIP extension exclusively.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D155850/new/

https://reviews.llvm.org/D155850

Files:
  clang/lib/CodeGen/BackendUtil.cpp
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/CodeGen/CGStmt.cpp
  clang/lib/CodeGen/CodeGenFunction.cpp
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/test/CodeGenHipStdPar/unannotated-functions-get-emitted.cpp
  clang/test/CodeGenHipStdPar/unsupported-ASM.cpp
  clang/test/CodeGenHipStdPar/unsupported-builtins.cpp

Index: clang/test/CodeGenHipStdPar/unsupported-builtins.cpp
===
--- /dev/null
+++ clang/test/CodeGenHipStdPar/unsupported-builtins.cpp
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu \
+// RUN:   --hipstdpar -x hip -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
+
+#define __global__ __attribute__((global))
+
+__global__ void foo() { return __builtin_ia32_pause(); }
+
+// CHECK: declare void @__builtin_ia32_pause__hipstdpar_unsupported()
Index: clang/test/CodeGenHipStdPar/unsupported-ASM.cpp
===
--- /dev/null
+++ clang/test/CodeGenHipStdPar/unsupported-ASM.cpp
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu \
+// RUN:   --hipstdpar -x hip -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
+
+#define __global__ __attribute__((global))
+
+__global__ void foo(int i) {
+asm ("addl %2, %1; seto %b0" : "=q" (i), "+g" (i) : "r" (i));
+}
+
+// CHECK: declare void @__ASM__hipstdpar_unsupported([{{.*}}])
Index: clang/test/CodeGenHipStdPar/unannotated-functions-get-emitted.cpp
===
--- /dev/null
+++ clang/test/CodeGenHipStdPar/unannotated-functions-get-emitted.cpp
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -x hip -emit-llvm -fcuda-is-device \
+// RUN:   -o - %s | FileCheck --check-prefix=NO-HIPSTDPAR-DEV %s
+
+// RUN: %clang_cc1 --hipstdpar -emit-llvm -fcuda-is-device \
+// RUN:   -o - %s | FileCheck --check-prefix=HIPSTDPAR-DEV %s
+
+#define __device__ __attribute__((device))
+
+// NO-HIPSTDPAR-DEV-NOT: define {{.*}} void @_Z3fooPff({{.*}})
+// HIPSTDPAR-DEV: define {{.*}} void @_Z3fooPff({{.*}})
+void foo(float *a, float b) {
+  *a = b;
+}
+
+// NO-HIPSTDPAR-DEV: define {{.*}} void @_Z3barPff({{.*}})
+// HIPSTDPAR-DEV: define {{.*}} void @_Z3barPff({{.*}})
+__device__ void bar(float *a, float b) {
+  *a = b;
+}
Index: clang/lib/CodeGen/CodeGenModule.cpp
===
--- clang/lib/CodeGen/CodeGenModule.cpp
+++ clang/lib/CodeGen/CodeGenModule.cpp
@@ -3558,7 +3558,10 @@
   !Global->hasAttr<CUDAConstantAttr>() &&
   !Global->hasAttr<CUDASharedAttr>() &&
   !Global->getType()->isCUDADeviceBuiltinSurfaceType() &&
-  !Global->getType()->isCUDADeviceBuiltinTextureType())
+  !Global->getType()->isCUDADeviceBuiltinTextureType() &&
+  !(LangOpts.HIPStdPar &&
+isa<FunctionDecl>(Global) &&
+!Global->hasAttr<CUDAHostAttr>()))
 return;
 } else {
   // We need to emit host-side 'shadows' for all global
Index: clang/lib/CodeGen/CodeGenFunction.cpp
===
--- clang/lib/CodeGen/CodeGenFunction.cpp
+++ clang/lib/CodeGen/CodeGenFunction.cpp
@@ -2594,10 +2594,15 @@
   std::string MissingFeature;
   llvm::StringMap<bool> CallerFeatureMap;
   CGM.getContext().getFunctionFeatureMap(CallerFeatureMap, FD);
+  // When compiling in HipStdPar mode we have to be conservative in rejecting
+  // target specific features in the FE, and defer the possible error to the
+  // AcceleratorCodeSelection pass, wherein iff an unsupported target builtin is
+  // referenced by an accelerator executable function, we emit an error.
+  bool IsHipStdPar = getLangOpts().HIPStdPar && getLangOpts().CUDAIsDevice;
   if (BuiltinID) {
 StringRef FeatureList(CGM.getContext().BuiltinInfo.getRequiredFeatures(BuiltinID));
 if (!Builtin::evaluateRequiredTargetFeatures(
-FeatureList, CallerFeatureMap)) {
+FeatureList, CallerFeatureMap) && !IsHipStdPar) {
   CGM.getDiags().Report(Loc, diag::err_builtin_needs_feature)
   << TargetDecl->getDeclName()
   << FeatureList;
@@ -2630,7 +2635,7 @@
 return false;
   }
   return true;
-}))
+}) && !IsHipStdPar)
   CGM.getDiags().Report(Loc, diag::err_function_needs_feature)
   << FD->getDeclName() << TargetDecl->getDeclName() << MissingFeature;
   } else if (!FD->isMultiVersion() && FD->hasAttr<TargetAttr>()) {
@@ -2639,7 +2644,8 @@
 
 for (const auto &F : CalleeFeatureMap) {
   if (F.getValue() && (!CallerFeatureMap.lookup(F.getKey()) ||
-

[PATCH] D155850: [Clang][CodeGen][RFC] Add codegen support for C++ Parallel Algorithm Offload

2023-08-10 Thread Alex Voicu via Phabricator via cfe-commits
AlexVlx updated this revision to Diff 549159.
AlexVlx added a comment.

Switch to `__ASM` prefix.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D155850/new/

https://reviews.llvm.org/D155850

Files:
  clang/lib/CodeGen/BackendUtil.cpp
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/CodeGen/CGStmt.cpp
  clang/lib/CodeGen/CodeGenFunction.cpp
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/test/CodeGenStdPar/unannotated-functions-get-emitted.cpp
  clang/test/CodeGenStdPar/unsupported-ASM.cpp
  clang/test/CodeGenStdPar/unsupported-builtins.cpp

Index: clang/test/CodeGenStdPar/unsupported-builtins.cpp
===
--- /dev/null
+++ clang/test/CodeGenStdPar/unsupported-builtins.cpp
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu \
+// RUN:   --stdpar -x hip -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
+
+#define __global__ __attribute__((global))
+
+__global__ void foo() { return __builtin_ia32_pause(); }
+
+// CHECK: declare void @__builtin_ia32_pause__stdpar_unsupported()
Index: clang/test/CodeGenStdPar/unsupported-ASM.cpp
===
--- /dev/null
+++ clang/test/CodeGenStdPar/unsupported-ASM.cpp
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu \
+// RUN:   --stdpar -x hip -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
+
+#define __global__ __attribute__((global))
+
+__global__ void foo(int i) {
+asm ("addl %2, %1; seto %b0" : "=q" (i), "+g" (i) : "r" (i));
+}
+
+// CHECK: declare void @__ASM__stdpar_unsupported([{{.*}}])
Index: clang/test/CodeGenStdPar/unannotated-functions-get-emitted.cpp
===
--- /dev/null
+++ clang/test/CodeGenStdPar/unannotated-functions-get-emitted.cpp
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -x hip -emit-llvm -fcuda-is-device \
+// RUN:   -o - %s | FileCheck --check-prefix=NO-STDPAR-DEV %s
+
+// RUN: %clang_cc1 --stdpar -emit-llvm -fcuda-is-device \
+// RUN:   -o - %s | FileCheck --check-prefix=STDPAR-DEV %s
+
+#define __device__ __attribute__((device))
+
+// NO-STDPAR-DEV-NOT: define {{.*}} void @_Z3fooPff({{.*}})
+// STDPAR-DEV: define {{.*}} void @_Z3fooPff({{.*}})
+void foo(float *a, float b) {
+  *a = b;
+}
+
+// NO-STDPAR-DEV: define {{.*}} void @_Z3barPff({{.*}})
+// STDPAR-DEV: define {{.*}} void @_Z3barPff({{.*}})
+__device__ void bar(float *a, float b) {
+  *a = b;
+}
Index: clang/lib/CodeGen/CodeGenModule.cpp
===
--- clang/lib/CodeGen/CodeGenModule.cpp
+++ clang/lib/CodeGen/CodeGenModule.cpp
@@ -3558,7 +3558,10 @@
   !Global->hasAttr<CUDAConstantAttr>() &&
   !Global->hasAttr<CUDASharedAttr>() &&
   !Global->getType()->isCUDADeviceBuiltinSurfaceType() &&
-  !Global->getType()->isCUDADeviceBuiltinTextureType())
+  !Global->getType()->isCUDADeviceBuiltinTextureType() &&
+  !(LangOpts.HIPStdPar &&
+isa<FunctionDecl>(Global) &&
+!Global->hasAttr<CUDAHostAttr>()))
 return;
 } else {
   // We need to emit host-side 'shadows' for all global
Index: clang/lib/CodeGen/CodeGenFunction.cpp
===
--- clang/lib/CodeGen/CodeGenFunction.cpp
+++ clang/lib/CodeGen/CodeGenFunction.cpp
@@ -2594,10 +2594,15 @@
   std::string MissingFeature;
   llvm::StringMap<bool> CallerFeatureMap;
   CGM.getContext().getFunctionFeatureMap(CallerFeatureMap, FD);
+  // When compiling in StdPar mode we have to be conservative in rejecting
+  // target specific features in the FE, and defer the possible error to the
+  // AcceleratorCodeSelection pass, wherein iff an unsupported target builtin is
+  // referenced by an accelerator executable function, we emit an error.
+  bool IsStdPar = getLangOpts().HIPStdPar && getLangOpts().CUDAIsDevice;
   if (BuiltinID) {
 StringRef FeatureList(CGM.getContext().BuiltinInfo.getRequiredFeatures(BuiltinID));
 if (!Builtin::evaluateRequiredTargetFeatures(
-FeatureList, CallerFeatureMap)) {
+FeatureList, CallerFeatureMap) && !IsStdPar) {
   CGM.getDiags().Report(Loc, diag::err_builtin_needs_feature)
   << TargetDecl->getDeclName()
   << FeatureList;
@@ -2630,7 +2635,7 @@
 return false;
   }
   return true;
-}))
+}) && !IsStdPar)
   CGM.getDiags().Report(Loc, diag::err_function_needs_feature)
   << FD->getDeclName() << TargetDecl->getDeclName() << MissingFeature;
   } else if (!FD->isMultiVersion() && FD->hasAttr()) {
@@ -2639,7 +2644,8 @@
 
 for (const auto &F : CalleeFeatureMap) {
   if (F.getValue() && (!CallerFeatureMap.lookup(F.getKey()) ||
-   !CallerFeatureMap.find(F.getKey())->getValue()))
+   !CallerFeatureMap.find(F.getKey())->getValue()) &&
+  !IsStdPar)
 CGM.getDiags().Report(Loc, diag::err_fu

[PATCH] D155850: [Clang][CodeGen][RFC] Add codegen support for C++ Parallel Algorithm Offload

2023-08-10 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl added inline comments.



Comment at: clang/lib/CodeGen/CGStmt.cpp:2422
+static void EmitStdParUnsupportedAsm(CodeGenFunction *CGF, const AsmStmt &S) {
+  constexpr auto Name = "ASM__stdpar_unsupported";
+

maybe prefix with `__` to avoid potential name collision with users' code


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D155850/new/

https://reviews.llvm.org/D155850

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D155850: [Clang][CodeGen][RFC] Add codegen support for C++ Parallel Algorithm Offload

2023-08-10 Thread Alex Voicu via Phabricator via cfe-commits
AlexVlx updated this revision to Diff 549101.
AlexVlx added a comment.

Fix typo.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D155850/new/

https://reviews.llvm.org/D155850

Files:
  clang/lib/CodeGen/BackendUtil.cpp
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/CodeGen/CGStmt.cpp
  clang/lib/CodeGen/CodeGenFunction.cpp
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/test/CodeGenStdPar/unannotated-functions-get-emitted.cpp
  clang/test/CodeGenStdPar/unsupported-ASM.cpp
  clang/test/CodeGenStdPar/unsupported-builtins.cpp

Index: clang/test/CodeGenStdPar/unsupported-builtins.cpp
===
--- /dev/null
+++ clang/test/CodeGenStdPar/unsupported-builtins.cpp
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu \
+// RUN:   --stdpar -x hip -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
+
+#define __global__ __attribute__((global))
+
+__global__ void foo() { return __builtin_ia32_pause(); }
+
+// CHECK: declare void @__builtin_ia32_pause__stdpar_unsupported()
Index: clang/test/CodeGenStdPar/unsupported-ASM.cpp
===
--- /dev/null
+++ clang/test/CodeGenStdPar/unsupported-ASM.cpp
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu \
+// RUN:   --stdpar -x hip -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
+
+#define __global__ __attribute__((global))
+
+__global__ void foo(int i) {
+asm ("addl %2, %1; seto %b0" : "=q" (i), "+g" (i) : "r" (i));
+}
+
+// CHECK: declare void @ASM__stdpar_unsupported([{{.*}}])
Index: clang/test/CodeGenStdPar/unannotated-functions-get-emitted.cpp
===
--- /dev/null
+++ clang/test/CodeGenStdPar/unannotated-functions-get-emitted.cpp
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -x hip -emit-llvm -fcuda-is-device \
+// RUN:   -o - %s | FileCheck --check-prefix=NO-STDPAR-DEV %s
+
+// RUN: %clang_cc1 --stdpar -emit-llvm -fcuda-is-device \
+// RUN:   -o - %s | FileCheck --check-prefix=STDPAR-DEV %s
+
+#define __device__ __attribute__((device))
+
+// NO-STDPAR-DEV-NOT: define {{.*}} void @_Z3fooPff({{.*}})
+// STDPAR-DEV: define {{.*}} void @_Z3fooPff({{.*}})
+void foo(float *a, float b) {
+  *a = b;
+}
+
+// NO-STDPAR-DEV: define {{.*}} void @_Z3barPff({{.*}})
+// STDPAR-DEV: define {{.*}} void @_Z3barPff({{.*}})
+__device__ void bar(float *a, float b) {
+  *a = b;
+}
Index: clang/lib/CodeGen/CodeGenModule.cpp
===
--- clang/lib/CodeGen/CodeGenModule.cpp
+++ clang/lib/CodeGen/CodeGenModule.cpp
@@ -3558,7 +3558,10 @@
   !Global->hasAttr<CUDAConstantAttr>() &&
   !Global->hasAttr<CUDASharedAttr>() &&
   !Global->getType()->isCUDADeviceBuiltinSurfaceType() &&
-  !Global->getType()->isCUDADeviceBuiltinTextureType())
+  !Global->getType()->isCUDADeviceBuiltinTextureType() &&
+  !(LangOpts.HIPStdPar &&
+isa<FunctionDecl>(Global) &&
+!Global->hasAttr<CUDAHostAttr>()))
 return;
 } else {
   // We need to emit host-side 'shadows' for all global
Index: clang/lib/CodeGen/CodeGenFunction.cpp
===
--- clang/lib/CodeGen/CodeGenFunction.cpp
+++ clang/lib/CodeGen/CodeGenFunction.cpp
@@ -2594,10 +2594,15 @@
   std::string MissingFeature;
   llvm::StringMap<bool> CallerFeatureMap;
   CGM.getContext().getFunctionFeatureMap(CallerFeatureMap, FD);
+  // When compiling in StdPar mode we have to be conservative in rejecting
+  // target specific features in the FE, and defer the possible error to the
+  // AcceleratorCodeSelection pass, wherein iff an unsupported target builtin is
+  // referenced by an accelerator executable function, we emit an error.
+  bool IsStdPar = getLangOpts().HIPStdPar && getLangOpts().CUDAIsDevice;
   if (BuiltinID) {
 StringRef FeatureList(CGM.getContext().BuiltinInfo.getRequiredFeatures(BuiltinID));
 if (!Builtin::evaluateRequiredTargetFeatures(
-FeatureList, CallerFeatureMap)) {
+FeatureList, CallerFeatureMap) && !IsStdPar) {
   CGM.getDiags().Report(Loc, diag::err_builtin_needs_feature)
   << TargetDecl->getDeclName()
   << FeatureList;
@@ -2630,7 +2635,7 @@
 return false;
   }
   return true;
-}))
+}) && !IsStdPar)
   CGM.getDiags().Report(Loc, diag::err_function_needs_feature)
   << FD->getDeclName() << TargetDecl->getDeclName() << MissingFeature;
   } else if (!FD->isMultiVersion() && FD->hasAttr()) {
@@ -2639,7 +2644,8 @@
 
 for (const auto &F : CalleeFeatureMap) {
   if (F.getValue() && (!CallerFeatureMap.lookup(F.getKey()) ||
-   !CallerFeatureMap.find(F.getKey())->getValue()))
+   !CallerFeatureMap.find(F.getKey())->getValue()) &&
+  !IsStdPar)
 CGM.getDiags().Report(Loc, diag::err_function_needs_featu

[PATCH] D155850: [Clang][CodeGen][RFC] Add codegen support for C++ Parallel Algorithm Offload

2023-08-10 Thread Alex Voicu via Phabricator via cfe-commits
AlexVlx updated this revision to Diff 549097.
AlexVlx added a comment.

Add support for handling certain cases of unambiguously accelerator unsupported 
ASM i.e. cases where constraints are clearly mismatched. When that happens, we 
instead emit an `ASM__stdpar_unsupported` stub which takes as its single 
argument the `constexpr` string value of the ASM block. Later, in the 
AcceleratorCodeSelection pass, if such a stub is reachable from an accelerator 
callable, we error out and print the offending ASM alongside the location.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D155850/new/

https://reviews.llvm.org/D155850

Files:
  clang/lib/CodeGen/BackendUtil.cpp
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/CodeGen/CGStmt.cpp
  clang/lib/CodeGen/CodeGenFunction.cpp
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/test/CodeGenStdPar/unannotated-functions-get-emitted.cpp
  clang/test/CodeGenStdPar/unsupported-ASM.cpp
  clang/test/CodeGenStdPar/unsupported-builtins.cpp

Index: clang/test/CodeGenStdPar/unsupported-builtins.cpp
===
--- /dev/null
+++ clang/test/CodeGenStdPar/unsupported-builtins.cpp
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu \
+// RUN:   --stdpar -x hip -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
+
+#define __global__ __attribute__((global))
+
+__global__ void foo() { return __builtin_ia32_pause(); }
+
+// CHECK: declare void @__builtin_ia32_pause__stdpar_unsupported()
Index: clang/test/CodeGenStdPar/unsupported-ASM.cpp
===
--- /dev/null
+++ clang/test/CodeGenStdPar/unsupported-ASM.cpp
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu \
+// RUN:   --stdpar -x hip -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
+
+#define __global__ __attribute__((global))
+
+__global__ void foo(int i) {
+asm ("addl %2, %1; seto %b0" : "=q" (i), "+g" (i) : "r" (i));
+}
+
+// CHECK: declare void @ASM__stdpar_unsupported([{{.*}}])
Index: clang/test/CodeGenStdPar/unannotated-functions-get-emitted.cpp
===
--- /dev/null
+++ clang/test/CodeGenStdPar/unannotated-functions-get-emitted.cpp
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -x hip -emit-llvm -fcuda-is-device \
+// RUN:   -o - %s | FileCheck --check-prefix=NO-STDPAR-DEV %s
+
+// RUN: %clang_cc1 --stdpar -emit-llvm -fcuda-is-device \
+// RUN:   -o - %s | FileCheck --check-prefix=STDPAR-DEV %s
+
+#define __device__ __attribute__((device))
+
+// NO-STDPAR-DEV-NOT: define {{.*}} void @_Z3fooPff({{.*}})
+// STDPAR-DEV: define {{.*}} void @_Z3fooPff({{.*}})
+void foo(float *a, float b) {
+  *a = b;
+}
+
+// NO-STDPAR-DEV: define {{.*}} void @_Z3barPff({{.*}})
+// STDPAR-DEV: define {{.*}} void @_Z3barPff({{.*}})
+__device__ void bar(float *a, float b) {
+  *a = b;
+}
Index: clang/lib/CodeGen/CodeGenModule.cpp
===
--- clang/lib/CodeGen/CodeGenModule.cpp
+++ clang/lib/CodeGen/CodeGenModule.cpp
@@ -3558,7 +3558,10 @@
   !Global->hasAttr<CUDAConstantAttr>() &&
   !Global->hasAttr<CUDASharedAttr>() &&
   !Global->getType()->isCUDADeviceBuiltinSurfaceType() &&
-  !Global->getType()->isCUDADeviceBuiltinTextureType())
+  !Global->getType()->isCUDADeviceBuiltinTextureType() &&
+  !(LangOpts.HIPStdPar &&
+isa<FunctionDecl>(Global) &&
+!Global->hasAttr<CUDAHostAttr>()))
 return;
 } else {
   // We need to emit host-side 'shadows' for all global
Index: clang/lib/CodeGen/CodeGenFunction.cpp
===
--- clang/lib/CodeGen/CodeGenFunction.cpp
+++ clang/lib/CodeGen/CodeGenFunction.cpp
@@ -2594,10 +2594,15 @@
   std::string MissingFeature;
   llvm::StringMap<bool> CallerFeatureMap;
   CGM.getContext().getFunctionFeatureMap(CallerFeatureMap, FD);
+  // When compiling in StdPar mode we have to be conservative in rejecting
+  // target specific features in the FE, and defer the possible error to the
+  // AcceleratorCodeSelection pass, wherein iff an unsupported target builtin is
+  // referenced by an accelerator executable function, we emit an error.
+  bool IsStdPar = getLangOpts().HIPStdPar && getLangOpts().CUDAIsDevice;
   if (BuiltinID) {
 StringRef FeatureList(CGM.getContext().BuiltinInfo.getRequiredFeatures(BuiltinID));
 if (!Builtin::evaluateRequiredTargetFeatures(
-FeatureList, CallerFeatureMap)) {
+FeatureList, CallerFeatureMap) && !IsStdPar) {
   CGM.getDiags().Report(Loc, diag::err_builtin_needs_feature)
   << TargetDecl->getDeclName()
   << FeatureList;
@@ -2630,7 +2635,7 @@
 return false;
   }
   return true;
-}))
+}) && !IsStdPar)
   CGM.getDiags().Report(Loc, diag::err_function_needs_feature)
   << FD->getDeclName() << TargetDecl->getDeclName() << Mis

[PATCH] D155850: [Clang][CodeGen][RFC] Add codegen support for C++ Parallel Algorithm Offload

2023-08-08 Thread Ronan Keryell via Phabricator via cfe-commits
keryell added inline comments.



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:5542
+  if (!getLangOpts().HIPStdPar)
+ErrorUnsupported(E, "builtin function");
 

AlexVlx wrote:
> efriedma wrote:
> > AlexVlx wrote:
> > > efriedma wrote:
> > > > This doesn't make sense; we can't just ignore bits of the source code.  
> > > > I guess this is related to "the decision on their validity is 
> > > > deferred", but I don't see how you expect this to work.
> > > This is one of the weirder parts, so let's consider the following example:
> > > 
> > > ```cpp
> > > void foo() { __builtin_ia32_pause(); }
> > > void bar() { __builtin_trap(); }
> > > 
> > > void baz(const vector& v) {
> > > return for_each(par_unseq, cbegin(v), cend(v), [](auto&& x) { if (x 
> > > == 42) bar(); });
> > > }
> > > ```
> > > 
> > > In the case above, what we'd offload to the accelerator, and ask the 
> > > target BE to lower, is the implementation of `for_each`, and `bar`, 
> > > because it is reachable from the latter. `foo` is not reachable by any 
> > > execution path on the accelerator side, however it includes a builtin 
> > > that is unsupported by the accelerator (unless said accelerator is x86, 
> > > which is not impossible, but not something we're dealing with at the 
> > > moment). If we were to actually error out early, in the FE, in these 
> > > cases, there's almost no appeal to what is being proposed, because 
> > > standard headers, as well as other libraries, are littered with various 
> > > target specific builtins that are not going to be supported. This all 
> > > builds on the core invariant of this model / extension / thingamabob, 
> > > which is that the algorithms, and only the algorithms, are targets for 
> > > offload. It thus follows that as long as code that is reachable from an 
> > > algorithm's implementation is safe, all is fine, but we cannot know this 
> > > in the FE / on an AST level, because we need the actual CFG. This part is 
> > > handled in LLVM in the `SelectAcceleratorCodePass` that's in the last 
> > > patch in this series.
> > > 
> > > Now, you will quite correctly observe that there's nothing preventing an 
> > > user from calling `foo` in the callable they pass to an algorithm; they 
> > > might read the docs / appreciate that this won't work, but even there 
> > > they are not safe, because there via some opaque call chain they might 
> > > end up touching some unsupported builtin. My intuition here, which is 
> > > reflected above in letting builtins just flow through, is that such cases 
> > > are better served with a compile time error, which is what will obtain 
> > > once the target BE chokes trying to lower an unsupported builtin. It's 
> > > not going to be a beautiful error, and we could probably prettify it 
> > > somewhat if we were to check after we've done the accelerator code 
> > > selection pass, but it will happen at compile time. Another solution 
> > > would be to emit these as traps (poison + trap for value returning ones), 
> > > but I am concerned that it would lead to really fascinating debug 
> > > journeys.
> > > 
> > > Having said this, if there's a better way to deal with these scenarios, 
> > > it would be rather nice. Similarly, if the above doesn't make sense, 
> > > please let me know.
> > > 
> > Oh, I see; you "optimistically" compile everything assuming it might run on 
> > the accelerator, then run LLVM IR optimizations, then determine late which 
> > bits of code will actually run on the accelerator, which then prunes the 
> > code which shouldn't run.
> > 
> > I'm not sure I really like this... would it be possible to infer which 
> > functions need to be run on the accelerator based on the AST?  I mean, if 
> > your API takes a lambda expression that runs on the accelerator, you can 
> > mark the lambda's body as "must be emitted for GPU", then recursively mark 
> > all the functions referred to by the lambda.
> > 
> > Emiting errors lazily from the backend means you get different diagnostics 
> > depending on the optimization level.
> > 
> > If you do go with this codegen-based approach, it's not clear to me how you 
> > detect that a forbidden builtin was called; if you skip the error handling, 
> > you just get a literal "undef".
> `I'm not sure I really like this...` - actually, I am not a big fan either, 
> however I think it's about the best one can do, given the constraints 
> (consume standard C++, no annotations on the user side etc.). Having tried a 
> few times in the past (and at least once in a different compiler), I don't 
> quite think this can be done on an AST level. It would add some fairly 
> awkward checking during template instantiation (no way to know earlier that a 
> `CallableFoo` was passed to an offloadable algorithm), and it's a bit 
> unwieldy to basically compute the CFG on the AST and mark reachable Callees 
> at that point. Ignoring those, the main reason for which we cannot do this is 
> 

[PATCH] D155850: [Clang][CodeGen][RFC] Add codegen support for C++ Parallel Algorithm Offload

2023-08-08 Thread Matt Arsenault via Phabricator via cfe-commits
arsenm added inline comments.



Comment at: clang/lib/CodeGen/BackendUtil.cpp:1101-1102
+MPM.addPass(StdParAcceleratorCodeSelectionPass());
+}
+else if (LangOpts.HIPStdParInterposeAlloc) {
+  MPM.addPass(StdParAllocationInterpositionPass());

Formatting


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D155850/new/

https://reviews.llvm.org/D155850

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D155850: [Clang][CodeGen][RFC] Add codegen support for C++ Parallel Algorithm Offload

2023-08-08 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl accepted this revision.
yaxunl added a comment.

LGTM from HIP side. Thanks.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D155850/new/

https://reviews.llvm.org/D155850

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D155850: [Clang][CodeGen][RFC] Add codegen support for C++ Parallel Algorithm Offload

2023-08-08 Thread Alex Voicu via Phabricator via cfe-commits
AlexVlx added a comment.

In D155850#4570336 , @efriedma wrote:

> LGTM (but please don't merge until we reach consensus on the overall feature)

Of course, and thank you for the review. Please, do stick around if you don't 
mind, because this'll still get at least one update.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D155850/new/

https://reviews.llvm.org/D155850

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D155850: [Clang][CodeGen][RFC] Add codegen support for C++ Parallel Algorithm Offload

2023-08-08 Thread Eli Friedman via Phabricator via cfe-commits
efriedma accepted this revision.
efriedma added a comment.
This revision is now accepted and ready to land.

LGTM (but please don't merge until we reach consensus on the overall feature)


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D155850/new/

https://reviews.llvm.org/D155850

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D155850: [Clang][CodeGen][RFC] Add codegen support for C++ Parallel Algorithm Offload

2023-08-07 Thread Alex Voicu via Phabricator via cfe-commits
AlexVlx updated this revision to Diff 548022.
AlexVlx added a comment.

Extend handling of unsupported builtins to include dealing with the `target` 
attribute.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D155850/new/

https://reviews.llvm.org/D155850

Files:
  clang/lib/CodeGen/BackendUtil.cpp
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/CodeGen/CodeGenFunction.cpp
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/test/CodeGenStdPar/unannotated-functions-get-emitted.cpp
  clang/test/CodeGenStdPar/unsupported-builtins.cpp

Index: clang/test/CodeGenStdPar/unsupported-builtins.cpp
===
--- /dev/null
+++ clang/test/CodeGenStdPar/unsupported-builtins.cpp
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu \
+// RUN:   --stdpar -x hip -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
+
+#define __global__ __attribute__((global))
+
+__global__ void foo() { return __builtin_ia32_pause(); }
+
+// CHECK: declare void @__builtin_ia32_pause__stdpar_unsupported()
Index: clang/test/CodeGenStdPar/unannotated-functions-get-emitted.cpp
===
--- /dev/null
+++ clang/test/CodeGenStdPar/unannotated-functions-get-emitted.cpp
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -x hip -emit-llvm -fcuda-is-device \
+// RUN:   -o - %s | FileCheck --check-prefix=NO-STDPAR-DEV %s
+
+// RUN: %clang_cc1 --stdpar -emit-llvm -fcuda-is-device \
+// RUN:   -o - %s | FileCheck --check-prefix=STDPAR-DEV %s
+
+#define __device__ __attribute__((device))
+
+// NO-STDPAR-DEV-NOT: define {{.*}} void @_Z3fooPff({{.*}})
+// STDPAR-DEV: define {{.*}} void @_Z3fooPff({{.*}})
+void foo(float *a, float b) {
+  *a = b;
+}
+
+// NO-STDPAR-DEV: define {{.*}} void @_Z3barPff({{.*}})
+// STDPAR-DEV: define {{.*}} void @_Z3barPff({{.*}})
+__device__ void bar(float *a, float b) {
+  *a = b;
+}
Index: clang/lib/CodeGen/CodeGenModule.cpp
===
--- clang/lib/CodeGen/CodeGenModule.cpp
+++ clang/lib/CodeGen/CodeGenModule.cpp
@@ -3550,7 +3550,10 @@
   !Global->hasAttr<CUDAConstantAttr>() &&
   !Global->hasAttr<CUDASharedAttr>() &&
   !Global->getType()->isCUDADeviceBuiltinSurfaceType() &&
-  !Global->getType()->isCUDADeviceBuiltinTextureType())
+  !Global->getType()->isCUDADeviceBuiltinTextureType() &&
+  !(LangOpts.HIPStdPar &&
+isa<FunctionDecl>(Global) &&
+!Global->hasAttr<CUDAHostAttr>()))
 return;
 } else {
   // We need to emit host-side 'shadows' for all global
Index: clang/lib/CodeGen/CodeGenFunction.cpp
===
--- clang/lib/CodeGen/CodeGenFunction.cpp
+++ clang/lib/CodeGen/CodeGenFunction.cpp
@@ -2594,10 +2594,15 @@
   std::string MissingFeature;
   llvm::StringMap<bool> CallerFeatureMap;
   CGM.getContext().getFunctionFeatureMap(CallerFeatureMap, FD);
+  // When compiling in StdPar mode we have to be conservative in rejecting
+  // target specific features in the FE, and defer the possible error to the
+  // AcceleratorCodeSelection pass, wherein iff an unsupported target builtin is
+  // referenced by an accelerator executable function, we emit an error.
+  bool IsStdPar = getLangOpts().HIPStdPar && getLangOpts().CUDAIsDevice;
   if (BuiltinID) {
 StringRef FeatureList(CGM.getContext().BuiltinInfo.getRequiredFeatures(BuiltinID));
 if (!Builtin::evaluateRequiredTargetFeatures(
-FeatureList, CallerFeatureMap)) {
+FeatureList, CallerFeatureMap) && !IsStdPar) {
   CGM.getDiags().Report(Loc, diag::err_builtin_needs_feature)
   << TargetDecl->getDeclName()
   << FeatureList;
@@ -2630,7 +2635,7 @@
 return false;
   }
   return true;
-}))
+}) && !IsStdPar)
   CGM.getDiags().Report(Loc, diag::err_function_needs_feature)
   << FD->getDeclName() << TargetDecl->getDeclName() << MissingFeature;
   } else if (!FD->isMultiVersion() && FD->hasAttr()) {
@@ -2639,7 +2644,8 @@
 
 for (const auto &F : CalleeFeatureMap) {
   if (F.getValue() && (!CallerFeatureMap.lookup(F.getKey()) ||
-   !CallerFeatureMap.find(F.getKey())->getValue()))
+   !CallerFeatureMap.find(F.getKey())->getValue()) &&
+  !IsStdPar)
 CGM.getDiags().Report(Loc, diag::err_function_needs_feature)
 << FD->getDeclName() << TargetDecl->getDeclName() << F.getKey();
 }
Index: clang/lib/CodeGen/CGBuiltin.cpp
===
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -2238,6 +2238,19 @@
   return nullptr;
 }
 
+static RValue EmitStdParUnsupportedBuiltin(CodeGenFunction *CGF,
+   const FunctionDecl *FD) {
+  auto Name = FD->getNameAsString() + "__stdpar_unsupported";
+  auto FnTy = CGF->CGM.getTypes().GetFunctionType(FD);
+

[PATCH] D155850: [Clang][CodeGen][RFC] Add codegen support for C++ Parallel Algorithm Offload

2023-08-03 Thread Alex Voicu via Phabricator via cfe-commits
AlexVlx updated this revision to Diff 547024.
AlexVlx added a comment.

Remove noise, correct style.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D155850/new/

https://reviews.llvm.org/D155850

Files:
  clang/lib/CodeGen/BackendUtil.cpp
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/test/CodeGenStdPar/unannotated-functions-get-emitted.cpp
  clang/test/CodeGenStdPar/unsupported-builtins.cpp

Index: clang/test/CodeGenStdPar/unsupported-builtins.cpp
===
--- /dev/null
+++ clang/test/CodeGenStdPar/unsupported-builtins.cpp
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu \
+// RUN:   --stdpar -x hip -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
+
+#define __global__ __attribute__((global))
+
+__global__ void foo() { return __builtin_ia32_pause(); }
+
+// CHECK: declare void @__builtin_ia32_pause__stdpar_unsupported()
Index: clang/test/CodeGenStdPar/unannotated-functions-get-emitted.cpp
===
--- /dev/null
+++ clang/test/CodeGenStdPar/unannotated-functions-get-emitted.cpp
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -x hip -emit-llvm -fcuda-is-device \
+// RUN:   -o - %s | FileCheck --check-prefix=NO-STDPAR-DEV %s
+
+// RUN: %clang_cc1 --stdpar -emit-llvm -fcuda-is-device \
+// RUN:   -o - %s | FileCheck --check-prefix=STDPAR-DEV %s
+
+#define __device__ __attribute__((device))
+
+// NO-STDPAR-DEV-NOT: define {{.*}} void @_Z3fooPff({{.*}})
+// STDPAR-DEV: define {{.*}} void @_Z3fooPff({{.*}})
+void foo(float *a, float b) {
+  *a = b;
+}
+
+// NO-STDPAR-DEV: define {{.*}} void @_Z3barPff({{.*}})
+// STDPAR-DEV: define {{.*}} void @_Z3barPff({{.*}})
+__device__ void bar(float *a, float b) {
+  *a = b;
+}
Index: clang/lib/CodeGen/CodeGenModule.cpp
===
--- clang/lib/CodeGen/CodeGenModule.cpp
+++ clang/lib/CodeGen/CodeGenModule.cpp
@@ -3550,7 +3550,10 @@
   !Global->hasAttr() &&
   !Global->hasAttr() &&
   !Global->getType()->isCUDADeviceBuiltinSurfaceType() &&
-  !Global->getType()->isCUDADeviceBuiltinTextureType())
+  !Global->getType()->isCUDADeviceBuiltinTextureType() &&
+  !(LangOpts.HIPStdPar &&
+isa(Global) &&
+!Global->hasAttr()))
 return;
 } else {
   // We need to emit host-side 'shadows' for all global
Index: clang/lib/CodeGen/CGBuiltin.cpp
===
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -2238,6 +2238,19 @@
   return nullptr;
 }
 
+static RValue EmitStdParUnsupportedBuiltin(CodeGenFunction *CGF,
+   const FunctionDecl *FD) {
+  auto Name = FD->getNameAsString() + "__stdpar_unsupported";
+  auto FnTy = CGF->CGM.getTypes().GetFunctionType(FD);
+  auto UBF = CGF->CGM.getModule().getOrInsertFunction(Name, FnTy);
+
+  SmallVector Args;
+  for (auto &&FormalTy : FnTy->params())
+Args.push_back(llvm::PoisonValue::get(FormalTy));
+
+  return RValue::get(CGF->Builder.CreateCall(UBF, Args));
+}
+
 RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
 const CallExpr *E,
 ReturnValueSlot ReturnValue) {
@@ -5545,6 +5558,9 @@
 llvm_unreachable("Bad evaluation kind in EmitBuiltinExpr");
   }
 
+  if (getLangOpts().HIPStdPar && getLangOpts().CUDAIsDevice)
+return EmitStdParUnsupportedBuiltin(this, FD);
+
   ErrorUnsupported(E, "builtin function");
 
   // Unknown builtin, for now just dump it out and return undef.
Index: clang/lib/CodeGen/BackendUtil.cpp
===
--- clang/lib/CodeGen/BackendUtil.cpp
+++ clang/lib/CodeGen/BackendUtil.cpp
@@ -77,6 +77,7 @@
 #include "llvm/Transforms/Scalar/EarlyCSE.h"
 #include "llvm/Transforms/Scalar/GVN.h"
 #include "llvm/Transforms/Scalar/JumpThreading.h"
+#include "llvm/Transforms/StdPar/StdPar.h"
 #include "llvm/Transforms/Utils/Debugify.h"
 #include "llvm/Transforms/Utils/EntryExitInstrumenter.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
@@ -1093,6 +1094,16 @@
   TheModule->addModuleFlag(Module::Error, "UnifiedLTO", uint32_t(1));
   }
 
+  if (LangOpts.HIPStdPar) {
+if (LangOpts.CUDAIsDevice) {
+  if (!TargetTriple.isAMDGCN())
+MPM.addPass(StdParAcceleratorCodeSelectionPass());
+}
+else if (LangOpts.HIPStdParInterposeAlloc) {
+  MPM.addPass(StdParAllocationInterpositionPass());
+}
+  }
+
   // Now that we have all of the passes ready, run them.
   {
 PrettyStackTraceString CrashInfo("Optimizer");
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D155850: [Clang][CodeGen][RFC] Add codegen support for C++ Parallel Algorithm Offload

2023-08-02 Thread Alex Voicu via Phabricator via cfe-commits
AlexVlx added inline comments.



Comment at: clang/lib/CodeGen/CodeGenModule.cpp:5315
+  !GV->isThreadLocal() &&
+  !(getLangOpts().HIPStdPar && getLangOpts().CUDAIsDevice)) {
 if (D->getTLSKind() == VarDecl::TLS_Dynamic)

efriedma wrote:
> You can't just pretend a thread-local variable isn't thread-local.  If the 
> intent here is that thread-local variables are illegal in device code, you 
> need to figure out some way to produce a diagnostic.  (Maybe by generating a 
> call to __stdpar_unsupported_threadlocal or something like that if code tries 
> to refer to such a variable.)
Oh, this is actually an error that slipped through, I botched the diff it 
appears, I'll correct it, apologies.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D155850/new/

https://reviews.llvm.org/D155850

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D155850: [Clang][CodeGen][RFC] Add codegen support for C++ Parallel Algorithm Offload

2023-08-02 Thread Eli Friedman via Phabricator via cfe-commits
efriedma added inline comments.



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:5559
+return EmitStdParUnsupportedBuiltin(this, FD);
+  else
+ErrorUnsupported(E, "builtin function");

Else-after-return.



Comment at: clang/lib/CodeGen/CodeGenModule.cpp:5315
+  !GV->isThreadLocal() &&
+  !(getLangOpts().HIPStdPar && getLangOpts().CUDAIsDevice)) {
 if (D->getTLSKind() == VarDecl::TLS_Dynamic)

You can't just pretend a thread-local variable isn't thread-local.  If the 
intent here is that thread-local variables are illegal in device code, you need 
to figure out some way to produce a diagnostic.  (Maybe by generating a call to 
__stdpar_unsupported_threadlocal or something like that if code tries to refer 
to such a variable.)


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D155850/new/

https://reviews.llvm.org/D155850

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D155850: [Clang][CodeGen][RFC] Add codegen support for C++ Parallel Algorithm Offload

2023-07-27 Thread Alex Voicu via Phabricator via cfe-commits
AlexVlx updated this revision to Diff 544954.
AlexVlx removed a reviewer: eli.friedman.
AlexVlx added a comment.

This adds more ecumenical handling of unsupported builtins, as per the review 
discussion (a suffixed equivalent stub is emitted instead); it's paired with an 
associated change in accelerator code selection pass, where the actual check 
for these stubs occurs. I've also adjusted where the latter pass gets added to 
the `opt` pipeline, for the AMDGCN target; for the latter it's better, for the 
moment, to run it later because we essentially do LTCG, and therefore can 
unambiguously determine reachability by operating on the full module.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D155850/new/

https://reviews.llvm.org/D155850

Files:
  clang/lib/CodeGen/BackendUtil.cpp
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/test/CodeGenStdPar/unannotated-functions-get-emitted.cpp
  clang/test/CodeGenStdPar/unsupported-builtins.cpp

Index: clang/test/CodeGenStdPar/unsupported-builtins.cpp
===
--- /dev/null
+++ clang/test/CodeGenStdPar/unsupported-builtins.cpp
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu \
+// RUN:   --stdpar -x hip -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
+
+#define __global__ __attribute__((global))
+
+__global__ void foo() { return __builtin_ia32_pause(); }
+
+// CHECK: declare void @__builtin_ia32_pause__stdpar_unsupported()
Index: clang/test/CodeGenStdPar/unannotated-functions-get-emitted.cpp
===
--- /dev/null
+++ clang/test/CodeGenStdPar/unannotated-functions-get-emitted.cpp
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -x hip -emit-llvm -fcuda-is-device \
+// RUN:   -o - %s | FileCheck --check-prefix=NO-STDPAR-DEV %s
+
+// RUN: %clang_cc1 --stdpar -emit-llvm -fcuda-is-device \
+// RUN:   -o - %s | FileCheck --check-prefix=STDPAR-DEV %s
+
+#define __device__ __attribute__((device))
+
+// NO-STDPAR-DEV-NOT: define {{.*}} void @_Z3fooPff({{.*}})
+// STDPAR-DEV: define {{.*}} void @_Z3fooPff({{.*}})
+void foo(float *a, float b) {
+  *a = b;
+}
+
+// NO-STDPAR-DEV: define {{.*}} void @_Z3barPff({{.*}})
+// STDPAR-DEV: define {{.*}} void @_Z3barPff({{.*}})
+__device__ void bar(float *a, float b) {
+  *a = b;
+}
Index: clang/lib/CodeGen/CodeGenModule.cpp
===
--- clang/lib/CodeGen/CodeGenModule.cpp
+++ clang/lib/CodeGen/CodeGenModule.cpp
@@ -3545,7 +3545,10 @@
   !Global->hasAttr() &&
   !Global->hasAttr() &&
   !Global->getType()->isCUDADeviceBuiltinSurfaceType() &&
-  !Global->getType()->isCUDADeviceBuiltinTextureType())
+  !Global->getType()->isCUDADeviceBuiltinTextureType() &&
+  !(LangOpts.HIPStdPar &&
+isa(Global) &&
+!Global->hasAttr()))
 return;
 } else {
   // We need to emit host-side 'shadows' for all global
@@ -5307,7 +5310,9 @@
 
   setNonAliasAttributes(D, GV);
 
-  if (D->getTLSKind() && !GV->isThreadLocal()) {
+  if (D->getTLSKind() &&
+  !GV->isThreadLocal() &&
+  !(getLangOpts().HIPStdPar && getLangOpts().CUDAIsDevice)) {
 if (D->getTLSKind() == VarDecl::TLS_Dynamic)
   CXXThreadLocals.push_back(D);
 setTLSMode(GV, *D);
Index: clang/lib/CodeGen/CGBuiltin.cpp
===
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -2251,6 +2251,19 @@
   return nullptr;
 }
 
+static RValue EmitStdParUnsupportedBuiltin(CodeGenFunction *CGF,
+   const FunctionDecl *FD) {
+  auto Name = FD->getNameAsString() + "__stdpar_unsupported";
+  auto FnTy = CGF->CGM.getTypes().GetFunctionType(FD);
+  auto UBF = CGF->CGM.getModule().getOrInsertFunction(Name, FnTy);
+
+  SmallVector Args;
+  for (auto &&FormalTy : FnTy->params())
+Args.push_back(llvm::PoisonValue::get(FormalTy));
+
+  return RValue::get(CGF->Builder.CreateCall(UBF, Args));
+}
+
 RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
 const CallExpr *E,
 ReturnValueSlot ReturnValue) {
@@ -5541,7 +5554,10 @@
 llvm_unreachable("Bad evaluation kind in EmitBuiltinExpr");
   }
 
-  ErrorUnsupported(E, "builtin function");
+  if (getLangOpts().HIPStdPar && getLangOpts().CUDAIsDevice)
+return EmitStdParUnsupportedBuiltin(this, FD);
+  else
+ErrorUnsupported(E, "builtin function");
 
   // Unknown builtin, for now just dump it out and return undef.
   return GetUndefRValue(E->getType());
Index: clang/lib/CodeGen/BackendUtil.cpp
===
--- clang/lib/CodeGen/BackendUtil.cpp
+++ clang/lib/CodeGen/BackendUtil.cpp
@@ -77,6 +77,7 @@
 #include "llvm/Transfor

[PATCH] D155850: [Clang][CodeGen][RFC] Add codegen support for C++ Parallel Algorithm Offload

2023-07-21 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl added a comment.

In D155850#4523051 , @AlexVlx wrote:

> @yaxunl interesting point - are you worried about cases where due to missing 
> inlining / const prop an indirect call site that can be replaced with a 
> direct one would remain indirect? I think the problem in that case would 
> actually be different, in that possibly reachable functions would not be 
> identified as such and would be erroneously removed. I'm not sure there's any 
> case where we'd fail to remove a meant to be unreachable function. We can 
> definitely go with the `__clang_unsupported` approach, but I think I'd prefer 
> these to be compile time errors rather than remarks + runtime `printf`, not 
> in the least because `printf` adds some overhead. A way to ensure we don't 
> "miss a spot" might be to check after removal for any remaining unsupported 
> builtins, instead of doing it during reachability computation (this is 
> coupled with the special naming from the prior post).

For programs having multiple TUs we cannot decide whether an unsupported 
function is used by a kernel during the compilation of a single TU. We can only 
decide that when we have the IR for the whole program. Currently, the HIP 
toolchain uses LTO of lld for multiple TUs, I am not sure whether we can emit 
clang diagnostics from lld. If not, then we need to use remarks. If we are 
confident to remove most unreachable unsupported functions at -O0, we may not 
need to use printf at run time. Remarks at LTO should be sufficient.

  if (foundGPU())
func_use_amdgpu_builtin();
  else
func_use_x64_builtin();



In D155850#4523051 , @AlexVlx wrote:

> @yaxunl interesting point - are you worried about cases where due to missing 
> inlining / const prop an indirect call site that can be replaced with a 
> direct one would remain indirect? I think the problem in that case would 
> actually be different, in that possibly reachable functions would not be 
> identified as such and would be erroneously removed. I'm not sure there's any 
> case where we'd fail to remove a meant to be unreachable function. We can 
> definitely go with the `__clang_unsupported` approach, but I think I'd prefer 
> these to be compile time errors rather than remarks + runtime `printf`, not 
> in the least because `printf` adds some overhead. A way to ensure we don't 
> "miss a spot" might be to check after removal for any remaining unsupported 
> builtins, instead of doing it during reachability computation (this is 
> coupled with the special naming from the prior post).




Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D155850/new/

https://reviews.llvm.org/D155850

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D155850: [Clang][CodeGen][RFC] Add codegen support for C++ Parallel Algorithm Offload

2023-07-21 Thread Eli Friedman via Phabricator via cfe-commits
efriedma added a comment.

> In what regards how to do deferred diagnostics, I think it can be done like 
> this (I crossed streams in my prior reply when discussing this part, so it's 
> actually nonsense): instead of emitting undef here, we can emit a builtin 
> with the same signature, but with the name suffixed with e.g. 
> (`__stdpar_unsupported`) or something similar. Then, when doing the 
> reachability computation later, if we stumble upon a node in the CFG that 
> contains a builtin suffixed with `__stdpar_unsupported` we error out, and can 
> provide nice diagnostics since we'd have the call-chain handy. Thoughts?

Sure, something like that.  If you stick a SourceLocation on it, you can even 
recover the original clang source location.

> We can definitely go with the __clang_unsupported approach, but I think I'd 
> prefer these to be compile time errors rather than remarks + runtime printf, 
> not in the least because printf adds some overhead.

The overhead should be pretty minimal if the code doesn't actually run.

> So TL;DR, I think it would be more complex to do this on the AST and would 
> end up more brittle / less future proof.



> Since we need to support -O0

The biggest downside of working in the backend is that it becomes very hard for 
users to predict what will compile, and will not compile.  Particularly if you 
want to support -O0.  (I was sort of assuming you just wouldn't support -O0.)  
If you work on the AST, fewer constructs will be accepted, but you can actually 
define rules about which constructs will/will not be accepted.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D155850/new/

https://reviews.llvm.org/D155850

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D155850: [Clang][CodeGen][RFC] Add codegen support for C++ Parallel Algorithm Offload

2023-07-21 Thread Alex Voicu via Phabricator via cfe-commits
AlexVlx added a comment.

@yaxunl interesting point - are you worried about cases where due to missing 
inlining / const prop an indirect call site that can be replaced with a direct 
one would remain indirect? I think the problem in that case would actually be 
different, in that possibly reachable functions would not be identified as such 
and would be erroneously removed. I'm not sure there's any case where we'd fail 
to remove a meant to be unreachable function. We can definitely go with the 
`__clang_unsupported` approach, but I think I'd prefer these to be compile time 
errors rather than remarks + runtime `printf`, not in the least because 
`printf` adds some overhead. A way to ensure we don't "miss a spot" might be to 
check after removal for any remaining unsupported builtins, instead of doing it 
during reachability computation (this is coupled with the special naming from 
the prior post).


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D155850/new/

https://reviews.llvm.org/D155850

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D155850: [Clang][CodeGen][RFC] Add codegen support for C++ Parallel Algorithm Offload

2023-07-21 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl added a comment.

Since we need to support -O0, we need to be prepared that we may not be able to 
remove all the calls of unsupported functions even though they may never be 
called at run time.

We could simply replace them with traps in the middle end. This should work if 
such functions are not called at run time. The only issue is that if they are 
called at run time, how do we tell users that they used unsupported functions 
and where. A trap does not help since it only crashes the program without 
useful information.

We could emit calls of any unsupported functions as calls of 
`__clang_unsupported(file_name, line_number, function_name)`.

In the middle-end pass where we eliminate functions not referenced by kernels, 
we could emit reports about calls of `__clang_unsupported` under a certain -R 
option. We could turn on that option for `-stdpar` in clang driver.

We can emit printf of file_name, line_number and function_name for the first 
active lane then emit trap for a call of `__clang_unsupported(file_name, 
line_number, function_name)` under an option in the middle-end pass to 
facilitate users debugging their code.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D155850/new/

https://reviews.llvm.org/D155850

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D155850: [Clang][CodeGen][RFC] Add codegen support for C++ Parallel Algorithm Offload

2023-07-21 Thread Alex Voicu via Phabricator via cfe-commits
AlexVlx added inline comments.



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:5542
+  if (!getLangOpts().HIPStdPar)
+ErrorUnsupported(E, "builtin function");
 

efriedma wrote:
> AlexVlx wrote:
> > efriedma wrote:
> > > This doesn't make sense; we can't just ignore bits of the source code.  I 
> > > guess this is related to "the decision on their validity is deferred", 
> > > but I don't see how you expect this to work.
> > This is one of the weirder parts, so let's consider the following example:
> > 
> > ```cpp
> > void foo() { __builtin_ia32_pause(); }
> > void bar() { __builtin_trap(); }
> > 
> > void baz(const vector& v) {
> > return for_each(par_unseq, cbegin(v), cend(v), [](auto&& x) { if (x == 
> > 42) bar(); });
> > }
> > ```
> > 
> > In the case above, what we'd offload to the accelerator, and ask the target 
> > BE to lower, is the implementation of `for_each`, and `bar`, because it is 
> > reachable from the latter. `foo` is not reachable by any execution path on 
> > the accelerator side, however it includes a builtin that is unsupported by 
> > the accelerator (unless said accelerator is x86, which is not impossible, 
> > but not something we're dealing with at the moment). If we were to actually 
> > error out early, in the FE, in these cases, there's almost no appeal to 
> > what is being proposed, because standard headers, as well as other 
> > libraries, are littered with various target specific builtins that are not 
> > going to be supported. This all builds on the core invariant of this model 
> > / extension / thingamabob, which is that the algorithms, and only the 
> > algorithms, are targets for offload. It thus follows that as long as code 
> > that is reachable from an algorithm's implementation is safe, all is fine, 
> > but we cannot know this in the FE / on an AST level, because we need the 
> > actual CFG. This part is handled in LLVM in the `SelectAcceleratorCodePass` 
> > that's in the last patch in this series.
> > 
> > Now, you will quite correctly observe that there's nothing preventing an 
> > user from calling `foo` in the callable they pass to an algorithm; they 
> > might read the docs / appreciate that this won't work, but even there they 
> > are not safe, because there via some opaque call chain they might end up 
> > touching some unsupported builtin. My intuition here, which is reflected 
> > above in letting builtins just flow through, is that such cases are better 
> > served with a compile time error, which is what will obtain once the target 
> > BE chokes trying to lower an unsupported builtin. It's not going to be a 
> > beautiful error, and we could probably prettify it somewhat if we were to 
> > check after we've done the accelerator code selection pass, but it will 
> > happen at compile time. Another solution would be to emit these as traps 
> > (poison + trap for value returning ones), but I am concerned that it would 
> > lead to really fascinating debug journeys.
> > 
> > Having said this, if there's a better way to deal with these scenarios, it 
> > would be rather nice. Similarly, if the above doesn't make sense, please 
> > let me know.
> > 
> Oh, I see; you "optimistically" compile everything assuming it might run on 
> the accelerator, then run LLVM IR optimizations, then determine late which 
> bits of code will actually run on the accelerator, which then prunes the code 
> which shouldn't run.
> 
> I'm not sure I really like this... would it be possible to infer which 
> functions need to be run on the accelerator based on the AST?  I mean, if 
> your API takes a lambda expression that runs on the accelerator, you can mark 
> the lambda's body as "must be emitted for GPU", then recursively mark all the 
> functions referred to by the lambda.
> 
> Emitting errors lazily from the backend means you get different diagnostics 
> depending on the optimization level.
> 
> If you do go with this codegen-based approach, it's not clear to me how you 
> detect that a forbidden builtin was called; if you skip the error handling, 
> you just get a literal "undef".
`I'm not sure I really like this...` - actually, I am not a big fan either, 
however I think it's about the best one can do, given the constraints (consume 
standard C++, no annotations on the user side etc.). Having tried a few times 
in the past (and at least once in a different compiler), I don't quite think 
this can be done on an AST level. It would add some fairly awkward checking 
during template instantiation (no way to know earlier that a `CallableFoo` was 
passed to an offloadable algorithm), and it's a bit unwieldy to basically 
compute the CFG on the AST and mark reachable Callees at that point. Ignoring 
those, the main reason for which we cannot do this is that the interface is not 
constrained to only take lambdas, but callables in general, and that includes 
pointers to function as well. We don't deal with those today, but plan to, and 
ther

[PATCH] D155850: [Clang][CodeGen][RFC] Add codegen support for C++ Parallel Algorithm Offload

2023-07-20 Thread Eli Friedman via Phabricator via cfe-commits
efriedma added inline comments.



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:5542
+  if (!getLangOpts().HIPStdPar)
+ErrorUnsupported(E, "builtin function");
 

AlexVlx wrote:
> efriedma wrote:
> > This doesn't make sense; we can't just ignore bits of the source code.  I 
> > guess this is related to "the decision on their validity is deferred", but 
> > I don't see how you expect this to work.
> This is one of the weirder parts, so let's consider the following example:
> 
> ```cpp
> void foo() { __builtin_ia32_pause(); }
> void bar() { __builtin_trap(); }
> 
> void baz(const vector& v) {
> return for_each(par_unseq, cbegin(v), cend(v), [](auto&& x) { if (x == 
> 42) bar(); });
> }
> ```
> 
> In the case above, what we'd offload to the accelerator, and ask the target 
> BE to lower, is the implementation of `for_each`, and `bar`, because it is 
> reachable from the latter. `foo` is not reachable by any execution path on 
> the accelerator side, however it includes a builtin that is unsupported by 
> the accelerator (unless said accelerator is x86, which is not impossible, but 
> not something we're dealing with at the moment). If we were to actually error 
> out early, in the FE, in these cases, there's almost no appeal to what is 
> being proposed, because standard headers, as well as other libraries, are 
> littered with various target specific builtins that are not going to be 
> supported. This all builds on the core invariant of this model / extension / 
> thingamabob, which is that the algorithms, and only the algorithms, are 
> targets for offload. It thus follows that as long as code that is reachable 
> from an algorithm's implementation is safe, all is fine, but we cannot know 
> this in the FE / on an AST level, because we need the actual CFG. This part 
> is handled in LLVM in the `SelectAcceleratorCodePass` that's in the last 
> patch in this series.
> 
> Now, you will quite correctly observe that there's nothing preventing an user 
> from calling `foo` in the callable they pass to an algorithm; they might read 
> the docs / appreciate that this won't work, but even there they are not safe, 
> because there via some opaque call chain they might end up touching some 
> unsupported builtin. My intuition here, which is reflected above in letting 
> builtins just flow through, is that such cases are better served with a 
> compile time error, which is what will obtain once the target BE chokes 
> trying to lower an unsupported builtin. It's not going to be a beautiful 
> error, and we could probably prettify it somewhat if we were to check after 
> we've done the accelerator code selection pass, but it will happen at compile 
> time. Another solution would be to emit these as traps (poison + trap for 
> value returning ones), but I am concerned that it would lead to really 
> fascinating debug journeys.
> 
> Having said this, if there's a better way to deal with these scenarios, it 
> would be rather nice. Similarly, if the above doesn't make sense, please let 
> me know.
> 
Oh, I see; you "optimistically" compile everything assuming it might run on the 
accelerator, then run LLVM IR optimizations, then determine late which bits of 
code will actually run on the accelerator, which then prunes the code which 
shouldn't run.

I'm not sure I really like this... would it be possible to infer which 
functions need to be run on the accelerator based on the AST?  I mean, if your 
API takes a lambda expression that runs on the accelerator, you can mark the 
lambda's body as "must be emitted for GPU", then recursively mark all the 
functions referred to by the lambda.

Emitting errors lazily from the backend means you get different diagnostics 
depending on the optimization level.

If you do go with this codegen-based approach, it's not clear to me how you 
detect that a forbidden builtin was called; if you skip the error handling, you 
just get a literal "undef".


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D155850/new/

https://reviews.llvm.org/D155850

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D155850: [Clang][CodeGen][RFC] Add codegen support for C++ Parallel Algorithm Offload

2023-07-20 Thread Alex Voicu via Phabricator via cfe-commits
AlexVlx added inline comments.



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:5542
+  if (!getLangOpts().HIPStdPar)
+ErrorUnsupported(E, "builtin function");
 

efriedma wrote:
> This doesn't make sense; we can't just ignore bits of the source code.  I 
> guess this is related to "the decision on their validity is deferred", but I 
> don't see how you expect this to work.
This is one of the weirder parts, so let's consider the following example:

```cpp
void foo() { __builtin_ia32_pause(); }
void bar() { __builtin_trap(); }

void baz(const vector& v) {
return for_each(par_unseq, cbegin(v), cend(v), [](auto&& x) { if (x == 42) 
bar(); });
}
```

In the case above, what we'd offload to the accelerator, and ask the target BE 
to lower, is the implementation of `for_each`, and `bar`, because it is 
reachable from the latter. `foo` is not reachable by any execution path on the 
accelerator side, however it includes a builtin that is unsupported by the 
accelerator (unless said accelerator is x86, which is not impossible, but not 
something we're dealing with at the moment). If we were to actually error out 
early, in the FE, in these cases, there's almost no appeal to what is being 
proposed, because standard headers, as well as other libraries, are littered 
with various target specific builtins that are not going to be supported. This 
all builds on the core invariant of this model / extension / thingamabob, which 
is that the algorithms, and only the algorithms, are targets for offload. It 
thus follows that as long as code that is reachable from an algorithm's 
implementation is safe, all is fine, but we cannot know this in the FE / on an 
AST level, because we need the actual CFG. This part is handled in LLVM in the 
`SelectAcceleratorCodePass` that's in the last patch in this series.

Now, you will quite correctly observe that there's nothing preventing a user 
from calling `foo` in the callable they pass to an algorithm; they might read 
the docs / appreciate that this won't work, but even there they are not safe, 
because there, via some opaque call chain, they might end up touching some 
unsupported builtin. My intuition here, which is reflected above in letting 
builtins just flow through, is that such cases are better served with a compile 
time error, which is what will obtain once the target BE chokes trying to lower 
an unsupported builtin. It's not going to be a beautiful error, and we could 
probably prettify it somewhat if we were to check after we've done the 
accelerator code selection pass, but it will happen at compile time. Another 
solution would be to emit these as traps (poison + trap for value returning 
ones), but I am concerned that it would lead to really fascinating debug 
journeys.

Having said this, if there's a better way to deal with these scenarios, it 
would be rather nice. Similarly, if the above doesn't make sense, please let me 
know.



Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D155850/new/

https://reviews.llvm.org/D155850

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D155850: [Clang][CodeGen][RFC] Add codegen support for C++ Parallel Algorithm Offload

2023-07-20 Thread Eli Friedman via Phabricator via cfe-commits
efriedma requested changes to this revision.
efriedma added inline comments.
This revision now requires changes to proceed.



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:5542
+  if (!getLangOpts().HIPStdPar)
+ErrorUnsupported(E, "builtin function");
 

This doesn't make sense; we can't just ignore bits of the source code.  I guess 
this is related to "the decision on their validity is deferred", but I don't 
see how you expect this to work.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D155850/new/

https://reviews.llvm.org/D155850

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D155850: [Clang][CodeGen][RFC] Add codegen support for C++ Parallel Algorithm Offload

2023-07-20 Thread Alex Voicu via Phabricator via cfe-commits
AlexVlx created this revision.
AlexVlx added reviewers: yaxunl, rjmccall, eli.friedman, arsenm, tra, jlebar.
AlexVlx added a project: clang.
Herald added a subscriber: ormris.
Herald added a project: All.
AlexVlx requested review of this revision.
Herald added subscribers: cfe-commits, wdng.

This patch adds the CodeGen changes needed by the standard algorithm offload 
feature being proposed here: 
https://discourse.llvm.org/t/rfc-adding-c-parallel-algorithm-offload-support-to-clang-llvm/72159/1.
 The verbose documentation is included in the head of the patch series. This 
change concludes the set of additions needed in Clang, and essentially relaxes 
restrictions on what gets emitted on the device path, when compiling in 
`stdpar` mode (after the previous patch relaxed restrictions on what is 
semantically correct):

1. Unless a function is explicitly marked `__host__`, it will get emitted, 
whereas before only `__device__` and `__global__` functions would be emitted;
  - At the moment we special case `thread_local` handling and still do not emit 
them, as they will require more scaffolding that will be proposed at some point 
in the future.
2. Unsupported builtins are ignored as opposed to being marked as an error, as 
the decision on their validity is deferred to the `stdpar` specific code 
selection pass we are adding, which will be the topic of the final patch in 
this series;
3. We add the `stdpar` specific passes to the `opt` pipeline, independent of 
optimisation level:
  - When compiling for the accelerator / offload device, we add a code 
selection pass;
  - When compiling for the host, iff the user requested it via the 
`--stdpar-interpose-alloc` flag, we add a pass which replaces canonical 
allocation / deallocation functions with accelerator aware equivalents.

A test to validate that unannotated functions get correctly emitted is added as 
well. Please note that `__device__`, `__global__` and `__host__` are used to 
match existing nomenclature, they would not be present in user code.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D155850

Files:
  clang/lib/CodeGen/BackendUtil.cpp
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/test/CodeGenStdPar/unannotated-functions-get-emitted.cpp


Index: clang/test/CodeGenStdPar/unannotated-functions-get-emitted.cpp
===
--- /dev/null
+++ clang/test/CodeGenStdPar/unannotated-functions-get-emitted.cpp
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -x hip -emit-llvm -fcuda-is-device \
+// RUN:   -o - %s | FileCheck --check-prefix=NO-STDPAR-DEV %s
+
+// RUN: %clang_cc1 --stdpar -emit-llvm -fcuda-is-device \
+// RUN:   -o - %s | FileCheck --check-prefix=STDPAR-DEV %s
+
+#define __device__ __attribute__((device))
+
+// NO-STDPAR-DEV-NOT: define {{.*}} void @_Z3fooPff({{.*}})
+// STDPAR-DEV: define {{.*}} void @_Z3fooPff({{.*}})
+void foo(float *a, float b) {
+  *a = b;
+}
+
+// NO-STDPAR-DEV: define {{.*}} void @_Z3barPff({{.*}})
+// STDPAR-DEV: define {{.*}} void @_Z3barPff({{.*}})
+__device__ void bar(float *a, float b) {
+  *a = b;
+}
\ No newline at end of file
Index: clang/lib/CodeGen/CodeGenModule.cpp
===
--- clang/lib/CodeGen/CodeGenModule.cpp
+++ clang/lib/CodeGen/CodeGenModule.cpp
@@ -3545,7 +3545,12 @@
   !Global->hasAttr<CUDAConstantAttr>() &&
   !Global->hasAttr<CUDASharedAttr>() &&
   !Global->getType()->isCUDADeviceBuiltinSurfaceType() &&
-  !Global->getType()->isCUDADeviceBuiltinTextureType())
+  !Global->getType()->isCUDADeviceBuiltinTextureType() &&
+  !(LangOpts.HIPStdPar &&
+isa<FunctionDecl>(Global) &&
+!cast<FunctionDecl>(Global)->getBuiltinID() &&
+!Global->hasAttr<CUDAHostAttr>() &&
+!cast<FunctionDecl>(Global)->isVariadic()))
 return;
 } else {
   // We need to emit host-side 'shadows' for all global
@@ -5310,7 +5315,9 @@
 
   setNonAliasAttributes(D, GV);
 
-  if (D->getTLSKind() && !GV->isThreadLocal()) {
+  if (D->getTLSKind() &&
+  !GV->isThreadLocal() &&
+  !(getLangOpts().HIPStdPar && getLangOpts().CUDAIsDevice)) {
 if (D->getTLSKind() == VarDecl::TLS_Dynamic)
   CXXThreadLocals.push_back(D);
 setTLSMode(GV, *D);
Index: clang/lib/CodeGen/CGBuiltin.cpp
===
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -5538,7 +5538,8 @@
 llvm_unreachable("Bad evaluation kind in EmitBuiltinExpr");
   }
 
-  ErrorUnsupported(E, "builtin function");
+  if (!getLangOpts().HIPStdPar)
+ErrorUnsupported(E, "builtin function");
 
   // Unknown builtin, for now just dump it out and return undef.
   return GetUndefRValue(E->getType());
Index: clang/lib/CodeGen/BackendUtil.cpp
===
--- clang/lib/CodeGen/BackendUtil.cpp
+++ clang/lib/CodeGen/BackendUtil.cpp
@@ -77,6 +77,7 @@
 #include "llvm/T