[PATCH] D38134: [OpenCL] Emit enqueued block as kernel

2017-10-14 Thread Yaxun Liu via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rL315804: [OpenCL] Emit enqueued block as kernel (authored by 
yaxunl).

Changed prior to commit:
  https://reviews.llvm.org/D38134?vs=118795&id=119017#toc

Repository:
  rL LLVM

https://reviews.llvm.org/D38134

Files:
  cfe/trunk/lib/CodeGen/CGBlocks.cpp
  cfe/trunk/lib/CodeGen/CGBuiltin.cpp
  cfe/trunk/lib/CodeGen/CGOpenCLRuntime.cpp
  cfe/trunk/lib/CodeGen/CGOpenCLRuntime.h
  cfe/trunk/lib/CodeGen/CodeGenFunction.h
  cfe/trunk/lib/CodeGen/CodeGenTypes.h
  cfe/trunk/lib/CodeGen/TargetInfo.cpp
  cfe/trunk/lib/CodeGen/TargetInfo.h
  cfe/trunk/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
  cfe/trunk/test/CodeGenOpenCL/blocks.cl
  cfe/trunk/test/CodeGenOpenCL/cl20-device-side-enqueue.cl

Index: cfe/trunk/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
===
--- cfe/trunk/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
+++ cfe/trunk/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
@@ -0,0 +1,36 @@
+// RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 -emit-llvm -o - -triple amdgcn | FileCheck %s --check-prefix=CHECK
+
+typedef struct {int a;} ndrange_t;
+
+// CHECK-LABEL: define amdgpu_kernel void @test
+kernel void test(global char *a, char b, global long *c, long d) {
+  queue_t default_queue;
+  unsigned flags = 0;
+  ndrange_t ndrange;
+
+  enqueue_kernel(default_queue, flags, ndrange,
+ ^(void) {
+ a[0] = b;
+ });
+
+  enqueue_kernel(default_queue, flags, ndrange,
+ ^(void) {
+ a[0] = b;
+ c[0] = d;
+ });
+}
+
+// CHECK-LABEL: define internal amdgpu_kernel void @__test_block_invoke_kernel(<{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>)
+// CHECK-SAME: #[[ATTR:[0-9]+]] !kernel_arg_addr_space !{{.*}} !kernel_arg_access_qual !{{.*}} !kernel_arg_type !{{.*}} !kernel_arg_base_type !{{.*}} !kernel_arg_type_qual !{{.*}}
+// CHECK: entry:
+// CHECK:  %1 = alloca <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, align 8
+// CHECK:  store <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }> %0, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %1, align 8
+// CHECK:  %2 = addrspacecast <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %1 to i8 addrspace(4)*
+// CHECK:  call void @__test_block_invoke(i8 addrspace(4)* %2)
+// CHECK:  ret void
+// CHECK:}
+
+// CHECK-LABEL: define internal amdgpu_kernel void @__test_block_invoke_2_kernel(<{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>)
+// CHECK-SAME: #[[ATTR]] !kernel_arg_addr_space !{{.*}} !kernel_arg_access_qual !{{.*}} !kernel_arg_type !{{.*}} !kernel_arg_base_type !{{.*}} !kernel_arg_type_qual !{{.*}}
+
+// CHECK: attributes #[[ATTR]] = { nounwind "enqueued-block" }
Index: cfe/trunk/test/CodeGenOpenCL/cl20-device-side-enqueue.cl
===
--- cfe/trunk/test/CodeGenOpenCL/cl20-device-side-enqueue.cl
+++ cfe/trunk/test/CodeGenOpenCL/cl20-device-side-enqueue.cl
@@ -6,10 +6,30 @@
 typedef void (^bl_t)(local void *);
 typedef struct {int a;} ndrange_t;
 
-// N.B. The check here only exists to set BL_GLOBAL
-// COMMON: @block_G =  addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL:@__block_literal_global(\.[0-9]+)?]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*)
+// COMMON: %struct.__opencl_block_literal_generic = type { i32, i32, i8 addrspace(4)* }
+
+// For a block global variable, first emit the block literal as a global variable, then emit the block variable itself.
+// COMMON: [[BL_GLOBAL:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INV_G:@[^ ]+]] to i8*) to i8 addrspace(4)*) }
+// COMMON: @block_G =  addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*)
+
+// For anonymous blocks without captures, emit block literals as global variable.
+// COMMON: [[BLG1:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG2:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspa

[PATCH] D38134: [OpenCL] Emit enqueued block as kernel

2017-10-13 Thread Anastasia Stulova via Phabricator via cfe-commits
Anastasia accepted this revision.
Anastasia added a comment.
This revision is now accepted and ready to land.

LGTM! Great work! Thanks!


https://reviews.llvm.org/D38134



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D38134: [OpenCL] Emit enqueued block as kernel

2017-10-12 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl added a comment.

In https://reviews.llvm.org/D38134#895848, @Anastasia wrote:

> I think it would be good to add a block test to CodeGenOpenCL where we would 
> just call the block without any enqueue and check that the invoke function is 
> generated but the kernel wrapper isn't.


we have test/CodeGenOpenCL/blocks.cl which only calls a block. I can add check 
to make sure no kernels generated.




Comment at: lib/CodeGen/CGBuiltin.cpp:2846
+  PtrToSizeArray};
+  std::vector ArgTys = {QueueTy,
+  IntTy,

Anastasia wrote:
> Formatting seems inconsistent from above.
Will fix.



Comment at: lib/CodeGen/CodeGenFunction.h:2921
 private:
-  /// Helpers for blocks
-  llvm::Value *EmitBlockLiteral(const CGBlockInfo &Info);
+  /// Helpers for blocks. Returns invoke function by \p InvokeF if it is not
+  /// nullptr.

Anastasia wrote:
> It will be nullptr in case block is not enqueued? May be it's worth 
> explaining it in the comment.
Will do.



Comment at: lib/CodeGen/TargetInfo.h:290
   }
+  /// Create an OpenCL kernel for an enqueued block.
+  virtual llvm::Function *

Anastasia wrote:
> Can we also explain the wrapper kernel here?
Will do.


https://reviews.llvm.org/D38134



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D38134: [OpenCL] Emit enqueued block as kernel

2017-10-12 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl updated this revision to Diff 118795.
yaxunl marked 5 inline comments as done.
yaxunl added a comment.

Revised by Anastasia's comments.


https://reviews.llvm.org/D38134

Files:
  lib/CodeGen/CGBlocks.cpp
  lib/CodeGen/CGBuiltin.cpp
  lib/CodeGen/CGOpenCLRuntime.cpp
  lib/CodeGen/CGOpenCLRuntime.h
  lib/CodeGen/CodeGenFunction.h
  lib/CodeGen/CodeGenTypes.h
  lib/CodeGen/TargetInfo.cpp
  lib/CodeGen/TargetInfo.h
  test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
  test/CodeGenOpenCL/blocks.cl
  test/CodeGenOpenCL/cl20-device-side-enqueue.cl

Index: test/CodeGenOpenCL/cl20-device-side-enqueue.cl
===
--- test/CodeGenOpenCL/cl20-device-side-enqueue.cl
+++ test/CodeGenOpenCL/cl20-device-side-enqueue.cl
@@ -6,10 +6,30 @@
 typedef void (^bl_t)(local void *);
 typedef struct {int a;} ndrange_t;
 
-// N.B. The check here only exists to set BL_GLOBAL
-// COMMON: @block_G =  addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL:@__block_literal_global(\.[0-9]+)?]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*)
+// COMMON: %struct.__opencl_block_literal_generic = type { i32, i32, i8 addrspace(4)* }
+
+// For a block global variable, first emit the block literal as a global variable, then emit the block variable itself.
+// COMMON: [[BL_GLOBAL:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INV_G:@[^ ]+]] to i8*) to i8 addrspace(4)*) }
+// COMMON: @block_G =  addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*)
+
+// For anonymous blocks without captures, emit block literals as global variable.
+// COMMON: [[BLG1:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG2:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG3:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG4:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG5:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG6:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG7:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG8:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* [[INVG8:@[^ ]+]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG9:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG9:@[^ ]+]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG10:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG11:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { 

[PATCH] D38134: [OpenCL] Emit enqueued block as kernel

2017-10-12 Thread Anastasia Stulova via Phabricator via cfe-commits
Anastasia added a comment.

I think it would be good to add a block test to CodeGenOpenCL where we would 
just call the block without any enqueue and check that the invoke function is 
generated but the kernel wrapper isn't.




Comment at: lib/CodeGen/CGBuiltin.cpp:2846
+  PtrToSizeArray};
+  std::vector ArgTys = {QueueTy,
+  IntTy,

Formatting seems inconsistent from above.



Comment at: lib/CodeGen/CodeGenFunction.h:2921
 private:
-  /// Helpers for blocks
-  llvm::Value *EmitBlockLiteral(const CGBlockInfo &Info);
+  /// Helpers for blocks. Returns invoke function by \p InvokeF if it is not
+  /// nullptr.

It will be nullptr in case block is not enqueued? May be it's worth explaining 
it in the comment.



Comment at: lib/CodeGen/TargetInfo.h:290
   }
+  /// Create an OpenCL kernel for an enqueued block.
+  virtual llvm::Function *

Can we also explain the wrapper kernel here?


https://reviews.llvm.org/D38134



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D38134: [OpenCL] Emit enqueued block as kernel

2017-10-11 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl updated this revision to Diff 118677.
yaxunl marked 2 inline comments as done.
yaxunl added a comment.

Revised by Anastasia's comments. Get block invoke function by API instead of 
iterate through IR's. Pass the block kernel directly to `__enqueu_kernel 
functions`.


https://reviews.llvm.org/D38134

Files:
  lib/CodeGen/CGBlocks.cpp
  lib/CodeGen/CGBuiltin.cpp
  lib/CodeGen/CGOpenCLRuntime.cpp
  lib/CodeGen/CGOpenCLRuntime.h
  lib/CodeGen/CodeGenFunction.h
  lib/CodeGen/CodeGenTypes.h
  lib/CodeGen/TargetInfo.cpp
  lib/CodeGen/TargetInfo.h
  test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
  test/CodeGenOpenCL/cl20-device-side-enqueue.cl

Index: test/CodeGenOpenCL/cl20-device-side-enqueue.cl
===
--- test/CodeGenOpenCL/cl20-device-side-enqueue.cl
+++ test/CodeGenOpenCL/cl20-device-side-enqueue.cl
@@ -6,10 +6,30 @@
 typedef void (^bl_t)(local void *);
 typedef struct {int a;} ndrange_t;
 
-// N.B. The check here only exists to set BL_GLOBAL
-// COMMON: @block_G =  addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL:@__block_literal_global(\.[0-9]+)?]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*)
+// COMMON: %struct.__opencl_block_literal_generic = type { i32, i32, i8 addrspace(4)* }
+
+// For a block global variable, first emit the block literal as a global variable, then emit the block variable itself.
+// COMMON: [[BL_GLOBAL:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INV_G:@[^ ]+]] to i8*) to i8 addrspace(4)*) }
+// COMMON: @block_G =  addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*)
+
+// For anonymous blocks without captures, emit block literals as global variable.
+// COMMON: [[BLG1:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG2:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG3:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG4:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG5:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG6:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG7:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG8:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* [[INVG8:@[^ ]+]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG9:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG9:@[^ ]+]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG10:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* {{@[^ ]+}} to i8*) to i8 add

[PATCH] D38134: [OpenCL] Emit enqueued block as kernel

2017-10-10 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl marked 4 inline comments as done.
yaxunl added inline comments.



Comment at: lib/CodeGen/CGOpenCLRuntime.cpp:144
+  if (auto *I = dyn_cast(V)) {
+// If the block literal is emitted as an instruction, it is an alloca
+// and the block invoke function is stored to GEP of this alloca.

Anastasia wrote:
> yaxunl wrote:
> > Anastasia wrote:
> > > Why do we need to replace original block calls with the kernels? I think 
> > > in case of calling a block we could use the original block function and 
> > > only for enqueue use the kernel that would call the block function 
> > > inside. The pointer to the kernel wrapper could be passed as an 
> > > additional parameter to `enqueue_kernel` calls. We won't need to iterate 
> > > through all IR then.
> > `CGF.EmitScalarExpr(Block)` returns the block literal structure which 
> > contains the size/align/invoke_function/captures. The block invoke function 
> > is stored to the struct by a `StoreInst`. To create the wrapper kernel, we 
> > need to get the block invoke function, therefore we have to iterate through 
> > IR.
> > 
> > Since we need to find the store instruction any way, it is simpler to just 
> > replace the stored function with the kernel and pass the block literal 
> > struct, instead of passing the kernel separately.
> So we cann't get the invoke function from the block literal structure passed 
> into the kernel wrapper directly knowing its offset? Iterating through IR 
> adds extra time and also I am not sure how reliable this is wrt different 
> corner cases of IR.
Unfortunately the invoke function is not returned directly. Instead, it is 
buried in an LLVM value. And to extract the invoke function from the LLVM value 
we have to wade through a bunch of LLVM IRs.

There is one way to get the invoke function directly instead of going through 
IRs, but we need to change the functions for generating code for the blocks a 
little bit so that they return the block invoke function.


https://reviews.llvm.org/D38134



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D38134: [OpenCL] Emit enqueued block as kernel

2017-10-10 Thread Anastasia Stulova via Phabricator via cfe-commits
Anastasia added inline comments.



Comment at: lib/CodeGen/CGOpenCLRuntime.cpp:144
+  if (auto *I = dyn_cast(V)) {
+// If the block literal is emitted as an instruction, it is an alloca
+// and the block invoke function is stored to GEP of this alloca.

yaxunl wrote:
> Anastasia wrote:
> > Why do we need to replace original block calls with the kernels? I think in 
> > case of calling a block we could use the original block function and only 
> > for enqueue use the kernel that would call the block function inside. The 
> > pointer to the kernel wrapper could be passed as an additional parameter to 
> > `enqueue_kernel` calls. We won't need to iterate through all IR then.
> `CGF.EmitScalarExpr(Block)` returns the block literal structure which 
> contains the size/align/invoke_function/captures. The block invoke function 
> is stored to the struct by a `StoreInst`. To create the wrapper kernel, we 
> need to get the block invoke function, therefore we have to iterate through 
> IR.
> 
> Since we need to find the store instruction any way, it is simpler to just 
> replace the stored function with the kernel and pass the block literal 
> struct, instead of passing the kernel separately.
So we cann't get the invoke function from the block literal structure passed 
into the kernel wrapper directly knowing its offset? Iterating through IR adds 
extra time and also I am not sure how reliable this is wrt different corner 
cases of IR.



Comment at: lib/CodeGen/TargetInfo.cpp:8949
+  Builder.restoreIP(IP);
+  return F;
+}

yaxunl wrote:
> Anastasia wrote:
> > Wondering if we should add the kernel metadata (w/o args) since it was used 
> > for long time to indicate the kernel.
> Currently (before this change), clang already does not generate kernel 
> metadata if there is no vec_type_hint, work_group_size_hint, 
> reqd_work_group_size. Remember last time we made the change to use function 
> metadata to represent these attributes. Whether a function is a kernel can be 
> determined by its calling convention.
Ok, let's leave it for now. We can always add it in on request. 



Comment at: test/CodeGenOpenCL/cl20-device-side-enqueue.cl:297
+// COMMON: define internal spir_kernel void [[INVG5]](i8 addrspace(4)*{{.*}}, 
i8 addrspace(3)*{{.*}})
+// COMMON: define internal spir_kernel void [[INVG6]](i8 addrspace(4)*{{.*}}, 
i8 addrspace(3)*{{.*}}, i8 addrspace(3)*{{.*}}, i8 addrspace(3)*{{.*}})
+// COMMON: define internal spir_kernel void [[INVG7]](i8 addrspace(4)*{{.*}}, 
i8 addrspace(3)*{{.*}})

Perhaps we could check the body of this one too since it has a different 
prototype.


https://reviews.llvm.org/D38134



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D38134: [OpenCL] Emit enqueued block as kernel

2017-10-06 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl updated this revision to Diff 118064.
yaxunl marked 5 inline comments as done.
yaxunl added a comment.

Revise by Anastasia's comments.


https://reviews.llvm.org/D38134

Files:
  lib/CodeGen/CGBuiltin.cpp
  lib/CodeGen/CGOpenCLRuntime.cpp
  lib/CodeGen/CGOpenCLRuntime.h
  lib/CodeGen/CodeGenTypes.h
  lib/CodeGen/TargetInfo.cpp
  lib/CodeGen/TargetInfo.h
  test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
  test/CodeGenOpenCL/cl20-device-side-enqueue.cl

Index: test/CodeGenOpenCL/cl20-device-side-enqueue.cl
===
--- test/CodeGenOpenCL/cl20-device-side-enqueue.cl
+++ test/CodeGenOpenCL/cl20-device-side-enqueue.cl
@@ -6,10 +6,33 @@
 typedef void (^bl_t)(local void *);
 typedef struct {int a;} ndrange_t;
 
-// N.B. The check here only exists to set BL_GLOBAL
-// COMMON: @block_G =  addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL:@__block_literal_global(\.[0-9]+)?]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*)
+// COMMON: %struct.__opencl_block_literal_generic = type { i32, i32, i8 addrspace(4)* }
+
+// For a block global variable, first emit the block literal as a global variable, then emit the block variable itself.
+// COMMON: [[BL_GLOBAL:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INV_G:@[^ ]+]] to i8*) to i8 addrspace(4)*) }
+// COMMON: @block_G =  addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*)
+
+// For anonymous blocks without captures, emit block literals as global variable.
+// COMMON: [[BLG1:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG1:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG2:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG2:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG3:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG3:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG4:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG4:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG5:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG5:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG6:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*)* [[INVG6:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG7:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG7:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG8:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* [[INVG8:@[^ ]+]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG9:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG9:@[^ ]+]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG8K:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* 

[PATCH] D38134: [OpenCL] Emit enqueued block as kernel

2017-10-06 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl marked 6 inline comments as done.
yaxunl added inline comments.



Comment at: lib/CodeGen/CGOpenCLRuntime.cpp:144
+  if (auto *I = dyn_cast(V)) {
+// If the block literal is emitted as an instruction, it is an alloca
+// and the block invoke function is stored to GEP of this alloca.

Anastasia wrote:
> Why do we need to replace original block calls with the kernels? I think in 
> case of calling a block we could use the original block function and only for 
> enqueue use the kernel that would call the block function inside. The pointer 
> to the kernel wrapper could be passed as an additional parameter to 
> `enqueue_kernel` calls. We won't need to iterate through all IR then.
`CGF.EmitScalarExpr(Block)` returns the block literal structure which contains 
the size/align/invoke_function/captures. The block invoke function is stored to 
the struct by a `StoreInst`. To create the wrapper kernel, we need to get the 
block invoke function, therefore we have to iterate through IR.

Since we need to find the store instruction any way, it is simpler to just 
replace the stored function with the kernel and pass the block literal struct, 
instead of passing the kernel separately.



Comment at: lib/CodeGen/TargetInfo.cpp:8927
+llvm::Function *
+TargetCodeGenInfo::createEnqueuedBlockKernel(CodeGenFunction &CGF,
+ llvm::Function *Invoke,

Anastasia wrote:
> Could you add some comments please?
Will do.



Comment at: lib/CodeGen/TargetInfo.cpp:8949
+  Builder.restoreIP(IP);
+  return F;
+}

Anastasia wrote:
> Wondering if we should add the kernel metadata (w/o args) since it was used 
> for long time to indicate the kernel.
Currently (before this change), clang already does not generate kernel metadata 
if there is no vec_type_hint, work_group_size_hint, reqd_work_group_size. 
Remember last time we made the change to use function metadata to represent 
these attributes. Whether a function is a kernel can be determined by its 
calling convention.



Comment at: lib/CodeGen/TargetInfo.h:35
 class Decl;
+class ASTContext;
 

Anastasia wrote:
> Do we need this?
Will remove it.



Comment at: test/CodeGenOpenCL/cl20-device-side-enqueue.cl:9
 
-// N.B. The check here only exists to set BL_GLOBAL
-// COMMON: @block_G =  addrspace(1) constant void (i8 addrspace(3)*) 
addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ 
i32, i32, i8 addrspace(4)* } addrspace(1)* 
[[BL_GLOBAL:@__block_literal_global(\.[0-9]+)?]] to void (i8 addrspace(3)*) 
addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*)
+// COMMON: %struct.__opencl_block_literal_generic = type { i32, i32, i8 
addrspace(4)* }
+

Anastasia wrote:
> Can we check generated kernel function too?
will do.


https://reviews.llvm.org/D38134



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D38134: [OpenCL] Emit enqueued block as kernel

2017-10-06 Thread Anastasia Stulova via Phabricator via cfe-commits
Anastasia added inline comments.



Comment at: lib/CodeGen/CGOpenCLRuntime.cpp:144
+  if (auto *I = dyn_cast(V)) {
+// If the block literal is emitted as an instruction, it is an alloca
+// and the block invoke function is stored to GEP of this alloca.

Why do we need to replace original block calls with the kernels? I think in 
case of calling a block we could use the original block function and only for 
enqueue use the kernel that would call the block function inside. The pointer 
to the kernel wrapper could be passed as an additional parameter to 
`enqueue_kernel` calls. We won't need to iterate through all IR then.



Comment at: lib/CodeGen/TargetInfo.cpp:8927
+llvm::Function *
+TargetCodeGenInfo::createEnqueuedBlockKernel(CodeGenFunction &CGF,
+ llvm::Function *Invoke,

Could you add some comments please?



Comment at: lib/CodeGen/TargetInfo.cpp:8949
+  Builder.restoreIP(IP);
+  return F;
+}

Wondering if we should add the kernel metadata (w/o args) since it was used for 
long time to indicate the kernel.



Comment at: lib/CodeGen/TargetInfo.h:35
 class Decl;
+class ASTContext;
 

Do we need this?



Comment at: test/CodeGenOpenCL/cl20-device-side-enqueue.cl:9
 
-// N.B. The check here only exists to set BL_GLOBAL
-// COMMON: @block_G =  addrspace(1) constant void (i8 addrspace(3)*) 
addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ 
i32, i32, i8 addrspace(4)* } addrspace(1)* 
[[BL_GLOBAL:@__block_literal_global(\.[0-9]+)?]] to void (i8 addrspace(3)*) 
addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*)
+// COMMON: %struct.__opencl_block_literal_generic = type { i32, i32, i8 
addrspace(4)* }
+

Can we check generated kernel function too?


https://reviews.llvm.org/D38134



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D38134: [OpenCL] Emit enqueued block as kernel

2017-10-04 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl updated this revision to Diff 117739.
yaxunl marked an inline comment as done.
yaxunl edited the summary of this revision.
yaxunl added a comment.

Emit enqueued block as a wrapper kernel which calls the block invoke function. 
Added test for calling and enqueue the same block.


https://reviews.llvm.org/D38134

Files:
  lib/CodeGen/CGBuiltin.cpp
  lib/CodeGen/CGOpenCLRuntime.cpp
  lib/CodeGen/CGOpenCLRuntime.h
  lib/CodeGen/CodeGenTypes.h
  lib/CodeGen/TargetInfo.cpp
  lib/CodeGen/TargetInfo.h
  test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
  test/CodeGenOpenCL/cl20-device-side-enqueue.cl

Index: test/CodeGenOpenCL/cl20-device-side-enqueue.cl
===
--- test/CodeGenOpenCL/cl20-device-side-enqueue.cl
+++ test/CodeGenOpenCL/cl20-device-side-enqueue.cl
@@ -6,10 +6,33 @@
 typedef void (^bl_t)(local void *);
 typedef struct {int a;} ndrange_t;
 
-// N.B. The check here only exists to set BL_GLOBAL
-// COMMON: @block_G =  addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL:@__block_literal_global(\.[0-9]+)?]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*)
+// COMMON: %struct.__opencl_block_literal_generic = type { i32, i32, i8 addrspace(4)* }
+
+// For a block global variable, first emit the block literal as a global variable, then emit the block variable itself.
+// COMMON: [[BL_GLOBAL:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INV_G:@[^ ]+]] to i8*) to i8 addrspace(4)*) }
+// COMMON: @block_G =  addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*)
+
+// For anonymous blocks without captures, emit block literals as global variable.
+// COMMON: [[BLG1:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG1:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG2:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG2:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG3:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG3:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG4:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG4:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG5:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG5:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG6:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*)* [[INVG6:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG7:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG7:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG8:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* [[INVG8:@[^ ]+]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG9:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG9:@[^ ]+]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG8K:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) c

[PATCH] D38134: [OpenCL] Emit enqueued block as kernel

2017-10-04 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl marked 10 inline comments as done.
yaxunl added a comment.

In https://reviews.llvm.org/D38134#880133, @Anastasia wrote:

> I think we should add a test case when the same block is both called and 
> enqueued.


Will do.




Comment at: test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl:3
+
+// CHECK: %[[S1:struct.__amdgpu_block_arg_t.*]] = type { [3 x i64], [1 x i8] }
+// CHECK: %[[S2:struct.__amdgpu_block_arg_t.*]] = type { [5 x i64], [1 x i8] }

Anastasia wrote:
> yaxunl wrote:
> > Anastasia wrote:
> > > This struct is not identical to block literal struct?
> > The LLVM type of the first argument of block invoke function is created 
> > directly with sorting and rearrangement. There is no AST type corresponding 
> > to it. However, the function codegen requires AST type of this argument. I 
> > feel it is unnecessary to create the corresponding AST type. For 
> > simplicity, just create an AST type with the same size and alignment as the 
> > LLVM type. In the function code, it will be bitcasted to the correct LLVM 
> > struct type and get the captured variables.
> So `void ptr` won't be possible here? Since it is cast to a right struct 
> inside the block anyway. Once again a block is a special type object with 
> known semantic to compiler and runtime in contract to kernels that can be 
> written with any arbitrary type of arguments.
> 
> I just don't like the idea of duplicating the block invoke function in case 
> it's being both called and enqueued. Also the login in blocks code generation 
> becomes difficult to understand. So I am wondering if we could perhaps create 
> a separate kernel function (as a wrapper) for enqueue_kernel which would call 
> a block instead. What do you think about it? I think the kernel prototype 
> would be fairly generic as it would just have a block call inside and pass 
> the arguments into it... We won't need to modify block generation then at 
> all. 
Will emit a wrapper kernel which calls the block invoke function and keep the 
block invoke function unchanged.


https://reviews.llvm.org/D38134



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D38134: [OpenCL] Emit enqueued block as kernel

2017-09-25 Thread Anastasia Stulova via Phabricator via cfe-commits
Anastasia added a comment.

I think we should add a test case when the same block is both called and 
enqueued.




Comment at: lib/CodeGen/CGOpenCLRuntime.cpp:113
+
+llvm::Value *CGOpenCLRuntime::emitOpenCLEnqueuedBlock(CodeGenFunction &CGF,
+  const Expr *E) {

yaxunl wrote:
> Anastasia wrote:
> > yaxunl wrote:
> > > Anastasia wrote:
> > > > I am not particularly in favour of duplicating CodeGen functionality as 
> > > > it typically has so many special cases that are hard to catch. Is this 
> > > > logic needed in order to pass to block literal information  that the 
> > > > block is enqueued?
> > > This code is needed to emit separate functions for a block which is 
> > > directly called and also enqueued as a kernel. Since the kernel needs to 
> > > have proper calling convention and ABI, it cannot be emitted as the same 
> > > function as when the block is called directly. Since it is OpenCL 
> > > specific code, I found it is cleaner to separate this code as member of 
> > > CGOpenCLRuntime instead of fitting it into CGF.EmitBlockLiteral.
> > This part is replacing standard `EmitScalarExpr` call which is doing 
> > several things before calling into block generation. That's why I am a bit 
> > worried we are covering all the corner cases here.
> > 
> > So if we transform all blocks into kernels unconditionally we won't need 
> > this special handling then?
> > 
> > Do we generate two versions of the blocks now: one for enqueue and one for 
> > call?
> If we transform all blocks into kernels, we could simplify the logic. 
> Probably will not need this special handling.
> 
> However, when the block is called directly, the first argument is a private 
> pointer, when it is executed as a kernel, the first argument is a global 
> pointer or a struct (for amdgpu target), therefore the emitted functions 
> cannot be the same.
Would using generic address space for this first argument not work?



Comment at: test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl:3
+
+// CHECK: %[[S1:struct.__amdgpu_block_arg_t.*]] = type { [3 x i64], [1 x i8] }
+// CHECK: %[[S2:struct.__amdgpu_block_arg_t.*]] = type { [5 x i64], [1 x i8] }

yaxunl wrote:
> Anastasia wrote:
> > This struct is not identical to block literal struct?
> The LLVM type of the first argument of block invoke function is created 
> directly with sorting and rearrangement. There is no AST type corresponding 
> to it. However, the function codegen requires AST type of this argument. I 
> feel it is unnecessary to create the corresponding AST type. For simplicity, 
> just create an AST type with the same size and alignment as the LLVM type. In 
> the function code, it will be bitcasted to the correct LLVM struct type and 
> get the captured variables.
So `void ptr` won't be possible here? Since it is cast to a right struct inside 
the block anyway. Once again a block is a special type object with known 
semantic to compiler and runtime in contract to kernels that can be written 
with any arbitrary type of arguments.

I just don't like the idea of duplicating the block invoke function in case 
it's being both called and enqueued. Also the login in blocks code generation 
becomes difficult to understand. So I am wondering if we could perhaps create a 
separate kernel function (as a wrapper) for enqueue_kernel which would call a 
block instead. What do you think about it? I think the kernel prototype would 
be fairly generic as it would just have a block call inside and pass the 
arguments into it... We won't need to modify block generation then at all. 


https://reviews.llvm.org/D38134



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D38134: [OpenCL] Emit enqueued block as kernel

2017-09-22 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl marked 10 inline comments as done.
yaxunl added inline comments.



Comment at: lib/CodeGen/CGOpenCLRuntime.cpp:113
+
+llvm::Value *CGOpenCLRuntime::emitOpenCLEnqueuedBlock(CodeGenFunction &CGF,
+  const Expr *E) {

Anastasia wrote:
> yaxunl wrote:
> > Anastasia wrote:
> > > I am not particularly in favour of duplicating CodeGen functionality as 
> > > it typically has so many special cases that are hard to catch. Is this 
> > > logic needed in order to pass to block literal information  that the 
> > > block is enqueued?
> > This code is needed to emit separate functions for a block which is 
> > directly called and also enqueued as a kernel. Since the kernel needs to 
> > have proper calling convention and ABI, it cannot be emitted as the same 
> > function as when the block is called directly. Since it is OpenCL specific 
> > code, I found it is cleaner to separate this code as member of 
> > CGOpenCLRuntime instead of fitting it into CGF.EmitBlockLiteral.
> This part is replacing standard `EmitScalarExpr` call which is doing several 
> things before calling into block generation. That's why I am a bit worried we 
> are covering all the corner cases here.
> 
> So if we transform all blocks into kernels unconditionally we won't need this 
> special handling then?
> 
> Do we generate two versions of the blocks now: one for enqueue and one for 
> call?
If we transform all blocks into kernels, we could simplify the logic. Probably 
will not need this special handling.

However, when the block is called directly, the first argument is a private 
pointer, when it is executed as a kernel, the first argument is a global 
pointer or a struct (for amdgpu target), therefore the emitted functions cannot 
be the same.



Comment at: lib/CodeGen/CodeGenFunction.cpp:535
+if (i == 0 && IsBlock) {
+  ty = CGF.CGM.getTargetCodeGenInfo().getEnqueuedBlockArgumentType(
+  ASTCtx, *CGF.BlockInfo);

Anastasia wrote:
> yaxunl wrote:
> > Anastasia wrote:
> > > I don't quite understand why we need to special case this? As far as I 
> > > undertsnad block argument is a `generic void* ` type but it's being cast 
> > > to a concrete block struct inside the block function. Do we gain anything 
> > > from having it a specific type here?
> > This argument is not part of BlockDecl. BlockDecl only has arguments shown 
> > up in the source code. The first argument in the LLVM block invoke function 
> > is generated by codegen and there is no correspondence in AST, so it has to 
> > be handled as a special case.
> Considering that enqueued kernel always takes the same type of the arguments 
> (`local void*`) and # of args is specified in `enqueue_kernel`, I was 
> wondering whether we need to generate the information on the kernel 
> parameters at all? The enqueueing side will have the information provided in 
> the `enqueue_kernel` code.
> 
> As for the block itself it can be passed as `generic void*` and then cast to 
> the block struct type inside the block itself.
amdgpu backend relies on kernel argument metadata to generate some metadata in 
elf for runtime to launch the kernel. The backend expects the kernel argument 
metadata on each kernel argument. Not generating kernel metadata on the first 
kernel argument requires special handling in the backend. I think it is better 
to let Clang generate kernel argument metadata for all kernel arguments.



Comment at: lib/CodeGen/CodeGenFunction.cpp:667
   llvm::MDNode::get(Context, argTypeQuals));
-  if (CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
+  if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
 Fn->setMetadata("kernel_arg_name",

Anastasia wrote:
> Why this change?
CGM is no longer a function parameter since now this function requires a CGF 
parameter.



Comment at: test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl:3
+
+// CHECK: %[[S1:struct.__amdgpu_block_arg_t.*]] = type { [3 x i64], [1 x i8] }
+// CHECK: %[[S2:struct.__amdgpu_block_arg_t.*]] = type { [5 x i64], [1 x i8] }

Anastasia wrote:
> This struct is not identical to block literal struct?
The LLVM type of the first argument of block invoke function is created 
directly with sorting and rearrangement. There is no AST type corresponding to 
it. However, the function codegen requires AST type of this argument. I feel it 
is unnecessary to create the corresponding AST type. For simplicity, just 
create an AST type with the same size and alignment as the LLVM type. In the 
function code, it will be bitcasted to the correct LLVM struct type and get the 
captured variables.


https://reviews.llvm.org/D38134



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D38134: [OpenCL] Emit enqueued block as kernel

2017-09-22 Thread Anastasia Stulova via Phabricator via cfe-commits
Anastasia added a comment.

>> I feel it would be much simpler if we could always generate the kernel 
>> metadata for blocks. A lot of special case code would be removed if we do 
>> this. OpenCL doesn't prevent kernel functions to be used just as normal 
>> functions (6.7.1) so it should be a perfectly valid thing to do. Do you seen 
>> any issues with that?
> 
> The special cases in metadata generation code is due to the first argument of 
> LLVM block invoke function is not defined in BlockDecl. Emitting metadata for 
> all block invoke functions does not help here.

To be more specific. I am just wondering what do we need for blocks to be used 
as kernels pragmatically. I feel it is essentially kernel calling convention 
and kernel metadata? The kernel arguments metadata however can be omitted 
because their type is fixed to be `local void*` and the number of arguments is 
passed into `enqueue_kernel` call so it is known at the enqueueing side too. 
The block descriptor can be passed as a generic pointer `generic void*` as it 
is cast to the right struct type inside the block invoke function anyway. So if 
we do this we can avoid adding a lot of extra code. Because blocks have reduced 
functionality compared to kernel functions. Also OpenCL allows kernel functions 
to be called just as normal functions so this way we can support second use 
case for blocks too. What do you think about it?




Comment at: lib/CodeGen/CGOpenCLRuntime.cpp:113
+
+llvm::Value *CGOpenCLRuntime::emitOpenCLEnqueuedBlock(CodeGenFunction &CGF,
+  const Expr *E) {

yaxunl wrote:
> Anastasia wrote:
> > I am not particularly in favour of duplicating CodeGen functionality as it 
> > typically has so many special cases that are hard to catch. Is this logic 
> > needed in order to pass to block literal information  that the block is 
> > enqueued?
> This code is needed to emit separate functions for a block which is directly 
> called and also enqueued as a kernel. Since the kernel needs to have proper 
> calling convention and ABI, it cannot be emitted as the same function as when 
> the block is called directly. Since it is OpenCL specific code, I found it is 
> cleaner to separate this code as member of CGOpenCLRuntime instead of fitting 
> it into CGF.EmitBlockLiteral.
This part is replacing standard `EmitScalarExpr` call which is doing several 
things before calling into block generation. That's why I am a bit worried we 
are covering all the corner cases here.

So if we transform all blocks into kernels unconditionally we won't need this 
special handling then?

Do we generate two versions of the blocks now: one for enqueue and one for call?



Comment at: lib/CodeGen/CodeGenFunction.cpp:535
+if (i == 0 && IsBlock) {
+  ty = CGF.CGM.getTargetCodeGenInfo().getEnqueuedBlockArgumentType(
+  ASTCtx, *CGF.BlockInfo);

yaxunl wrote:
> Anastasia wrote:
> > I don't quite understand why we need to special case this? As far as I 
> > undertsnad block argument is a `generic void* ` type but it's being cast to 
> > a concrete block struct inside the block function. Do we gain anything from 
> > having it a specific type here?
> This argument is not part of BlockDecl. BlockDecl only has arguments shown up 
> in the source code. The first argument in the LLVM block invoke function is 
> generated by codegen and there is no correspondence in AST, so it has to be 
> handled as a special case.
Considering that enqueued kernel always takes the same type of the arguments 
(`local void*`) and # of args is specified in `enqueue_kernel`, I was wondering 
whether we need to generate the information on the kernel parameters at all? 
The enqueueing side will have the information provided in the `enqueue_kernel` 
code.

As for the block itself it can be passed as `generic void*` and then cast to 
the block struct type inside the block itself.



Comment at: lib/CodeGen/CodeGenFunction.cpp:667
   llvm::MDNode::get(Context, argTypeQuals));
-  if (CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
+  if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
 Fn->setMetadata("kernel_arg_name",

Why this change?



Comment at: test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl:3
+
+// CHECK: %[[S1:struct.__amdgpu_block_arg_t.*]] = type { [3 x i64], [1 x i8] }
+// CHECK: %[[S2:struct.__amdgpu_block_arg_t.*]] = type { [5 x i64], [1 x i8] }

This struct is not identical to block literal struct?


https://reviews.llvm.org/D38134



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D38134: [OpenCL] Emit enqueued block as kernel

2017-09-21 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl marked 3 inline comments as done.
yaxunl added a comment.

In https://reviews.llvm.org/D38134#877831, @Anastasia wrote:

> Now if we have a block which is being called and enqueued at the same time, 
> will we generate 2 functions for it? Could we add such test case btw?


Yes. It is covered by test/CodeGenOpenCL/cl20-device-side-enqueue.cl, line 246, 
250, and 256.

> I feel it would be much simpler if we could always generate the kernel 
> metadata for blocks. A lot of special case code would be removed if we do 
> this. OpenCL doesn't prevent kernel functions to be used just as normal 
> functions (6.7.1) so it should be a perfectly valid thing to do. Do you seen 
> any issues with that?

The special cases in metadata generation code is due to the first argument of 
LLVM block invoke function is not defined in BlockDecl. Emitting metadata for 
all block invoke functions does not help here.




Comment at: lib/CodeGen/CGBlocks.cpp:1255
 // Allocate a stack slot to let the debug info survive the RA.
-Address alloc = CreateMemTemp(D->getType(), D->getName() + ".addr");
+Address alloc = CreateMemTemp(
+!PV.isIndirect() ? D->getType()

Anastasia wrote:
> Is there any test that covers this?
Yes. This is covered by test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl where the 
block struct is passed directly to the kernel instead of by a pointer.



Comment at: lib/CodeGen/CGOpenCLRuntime.cpp:113
+
+llvm::Value *CGOpenCLRuntime::emitOpenCLEnqueuedBlock(CodeGenFunction &CGF,
+  const Expr *E) {

Anastasia wrote:
> I am not particularly in favour of duplicating CodeGen functionality as it 
> typically has so many special cases that are hard to catch. Is this logic 
> needed in order to pass to block literal information  that the block is 
> enqueued?
This code is needed to emit separate functions for a block which is directly 
called and also enqueued as a kernel. Since the kernel needs to have proper 
calling convention and ABI, it cannot be emitted as the same function as when 
the block is called directly. Since it is OpenCL specific code, I found it is 
cleaner to separate this code as member of CGOpenCLRuntime instead of fitting 
it into CGF.EmitBlockLiteral.



Comment at: lib/CodeGen/CodeGenFunction.cpp:535
+if (i == 0 && IsBlock) {
+  ty = CGF.CGM.getTargetCodeGenInfo().getEnqueuedBlockArgumentType(
+  ASTCtx, *CGF.BlockInfo);

Anastasia wrote:
> I don't quite understand why we need to special case this? As far as I 
> undertsnad block argument is a `generic void* ` type but it's being cast to a 
> concrete block struct inside the block function. Do we gain anything from 
> having it a specific type here?
This argument is not part of BlockDecl. BlockDecl only has arguments shown up 
in the source code. The first argument in the LLVM block invoke function is 
generated by codegen and there is no correspondence in AST, so it has to be 
handled as a special case.


https://reviews.llvm.org/D38134



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D38134: [OpenCL] Emit enqueued block as kernel

2017-09-21 Thread Anastasia Stulova via Phabricator via cfe-commits
Anastasia added a comment.

Now if we have a block which is being called and enqueued at the same time, 
will we generate 2 functions for it? Could we add such test case btw?

I feel it would be much simpler if we could always generate the kernel metadata 
for blocks. A lot of special case code would be removed if we do this. OpenCL 
doesn't prevent kernel functions to be used just as normal functions (6.7.1) so 
it should be a perfectly valid thing to do. Do you seen any issues with that?




Comment at: lib/CodeGen/CGBlocks.cpp:1255
 // Allocate a stack slot to let the debug info survive the RA.
-Address alloc = CreateMemTemp(D->getType(), D->getName() + ".addr");
+Address alloc = CreateMemTemp(
+!PV.isIndirect() ? D->getType()

Is there any test that covers this?



Comment at: lib/CodeGen/CGOpenCLRuntime.cpp:113
+
+llvm::Value *CGOpenCLRuntime::emitOpenCLEnqueuedBlock(CodeGenFunction &CGF,
+  const Expr *E) {

I am not particularly in favour of duplicating CodeGen functionality as it 
typically has so many special cases that are hard to catch. Is this logic 
needed in order to pass to block literal information  that the block is 
enqueued?



Comment at: lib/CodeGen/CodeGenFunction.cpp:535
+if (i == 0 && IsBlock) {
+  ty = CGF.CGM.getTargetCodeGenInfo().getEnqueuedBlockArgumentType(
+  ASTCtx, *CGF.BlockInfo);

I don't quite understand why we need to special case this? As far as I 
undertsnad block argument is a `generic void* ` type but it's being cast to a 
concrete block struct inside the block function. Do we gain anything from 
having it a specific type here?


https://reviews.llvm.org/D38134



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D38134: [OpenCL] Emit enqueued block as kernel

2017-09-21 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl created this revision.
Herald added a subscriber: nhaehnle.

In OpenCL the kernel function and non-kernel function has different calling 
conventions.
For certain targets they have different argument ABIs. Also kernels have 
special function
attributes and metadata for runtime to launch them.

The blocks passed to enqueue_kernel is supposed to be executed as kernels. As 
such,
the block invoke function should be emitted as kernel with proper calling 
convention,
argument ABI, function attributes and metadata.

However, currently Clang does not emit enqueued blocks as kernels, which causes 
issues
such as address space of captured variables.

This patch emits enqueued block as kernel. If a block is both called directly 
and passed
to enqueue_kernel, separate functions will be generated.


https://reviews.llvm.org/D38134

Files:
  lib/CodeGen/CGBlocks.cpp
  lib/CodeGen/CGBlocks.h
  lib/CodeGen/CGBuiltin.cpp
  lib/CodeGen/CGCall.cpp
  lib/CodeGen/CGDecl.cpp
  lib/CodeGen/CGOpenCLRuntime.cpp
  lib/CodeGen/CGOpenCLRuntime.h
  lib/CodeGen/CodeGenFunction.cpp
  lib/CodeGen/CodeGenFunction.h
  lib/CodeGen/CodeGenTypes.h
  lib/CodeGen/TargetInfo.cpp
  lib/CodeGen/TargetInfo.h
  test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
  test/CodeGenOpenCL/blocks.cl
  test/CodeGenOpenCL/cl20-device-side-enqueue.cl

Index: test/CodeGenOpenCL/cl20-device-side-enqueue.cl
===
--- test/CodeGenOpenCL/cl20-device-side-enqueue.cl
+++ test/CodeGenOpenCL/cl20-device-side-enqueue.cl
@@ -6,10 +6,33 @@
 typedef void (^bl_t)(local void *);
 typedef struct {int a;} ndrange_t;
 
-// N.B. The check here only exists to set BL_GLOBAL
-// COMMON: @block_G =  addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL:@__block_literal_global(\.[0-9]+)?]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*)
+// COMMON: %struct.__opencl_block_literal_generic = type { i32, i32, i8 addrspace(4)* }
+
+// For a block global variable, first emit the block literal as a global variable, then emit the block variable itself.
+// COMMON: [[BL_GLOBAL:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INV_G:@[^ ]+]] to i8*) to i8 addrspace(4)*) }
+// COMMON: @block_G =  addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*)
+
+// For anonymous blocks without captures, emit block literals as global variable.
+// COMMON: [[BLG1:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(1)*, i8 addrspace(3)*)* [[INVG1:@[^ ]+]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG2:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(1)*, i8 addrspace(3)*)* [[INVG2:@[^ ]+]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG3:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(1)*, i8 addrspace(3)*)* [[INVG3:@[^ ]+]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG4:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(1)*, i8 addrspace(3)*)* [[INVG4:@[^ ]+]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG5:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(1)*, i8 addrspace(3)*)* [[INVG5:@[^ ]+]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG6:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(1)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*)* [[INVG6:@[^ ]+]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG7:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(1)*, i8 addrspace(3)*)* [[INVG7:@[^ ]+]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG8:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i