[PATCH] D38134: [OpenCL] Emit enqueued block as kernel
This revision was automatically updated to reflect the committed changes. Closed by commit rL315804: [OpenCL] Emit enqueued block as kernel (authored by yaxunl). Changed prior to commit: https://reviews.llvm.org/D38134?vs=118795&id=119017#toc Repository: rL LLVM https://reviews.llvm.org/D38134 Files: cfe/trunk/lib/CodeGen/CGBlocks.cpp cfe/trunk/lib/CodeGen/CGBuiltin.cpp cfe/trunk/lib/CodeGen/CGOpenCLRuntime.cpp cfe/trunk/lib/CodeGen/CGOpenCLRuntime.h cfe/trunk/lib/CodeGen/CodeGenFunction.h cfe/trunk/lib/CodeGen/CodeGenTypes.h cfe/trunk/lib/CodeGen/TargetInfo.cpp cfe/trunk/lib/CodeGen/TargetInfo.h cfe/trunk/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl cfe/trunk/test/CodeGenOpenCL/blocks.cl cfe/trunk/test/CodeGenOpenCL/cl20-device-side-enqueue.cl Index: cfe/trunk/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl === --- cfe/trunk/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl +++ cfe/trunk/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl @@ -0,0 +1,36 @@ +// RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 -emit-llvm -o - -triple amdgcn | FileCheck %s --check-prefix=CHECK + +typedef struct {int a;} ndrange_t; + +// CHECK-LABEL: define amdgpu_kernel void @test +kernel void test(global char *a, char b, global long *c, long d) { + queue_t default_queue; + unsigned flags = 0; + ndrange_t ndrange; + + enqueue_kernel(default_queue, flags, ndrange, + ^(void) { + a[0] = b; + }); + + enqueue_kernel(default_queue, flags, ndrange, + ^(void) { + a[0] = b; + c[0] = d; + }); +} + +// CHECK-LABEL: define internal amdgpu_kernel void @__test_block_invoke_kernel(<{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>) +// CHECK-SAME: #[[ATTR:[0-9]+]] !kernel_arg_addr_space !{{.*}} !kernel_arg_access_qual !{{.*}} !kernel_arg_type !{{.*}} !kernel_arg_base_type !{{.*}} !kernel_arg_type_qual !{{.*}} +// CHECK: entry: +// CHECK: %1 = alloca <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, align 8 +// CHECK: store <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }> %0, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %1, align 8 +// CHECK: %2 = addrspacecast <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %1 to i8 addrspace(4)* +// CHECK: call void @__test_block_invoke(i8 addrspace(4)* %2) +// CHECK: ret void +// CHECK:} + +// CHECK-LABEL: define internal amdgpu_kernel void @__test_block_invoke_2_kernel(<{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>) +// CHECK-SAME: #[[ATTR]] !kernel_arg_addr_space !{{.*}} !kernel_arg_access_qual !{{.*}} !kernel_arg_type !{{.*}} !kernel_arg_base_type !{{.*}} !kernel_arg_type_qual !{{.*}} + +// CHECK: attributes #[[ATTR]] = { nounwind "enqueued-block" } Index: cfe/trunk/test/CodeGenOpenCL/cl20-device-side-enqueue.cl === --- cfe/trunk/test/CodeGenOpenCL/cl20-device-side-enqueue.cl +++ cfe/trunk/test/CodeGenOpenCL/cl20-device-side-enqueue.cl @@ -6,10 +6,30 @@ typedef void (^bl_t)(local void *); typedef struct {int a;} ndrange_t; -// N.B. The check here only exists to set BL_GLOBAL -// COMMON: @block_G = addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL:@__block_literal_global(\.[0-9]+)?]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*) +// COMMON: %struct.__opencl_block_literal_generic = type { i32, i32, i8 addrspace(4)* } + +// For a block global variable, first emit the block literal as a global variable, then emit the block variable itself. +// COMMON: [[BL_GLOBAL:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INV_G:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: @block_G = addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*) + +// For anonymous blocks without captures, emit block literals as global variable. +// COMMON: [[BLG1:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG2:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspa
[PATCH] D38134: [OpenCL] Emit enqueued block as kernel
Anastasia accepted this revision. Anastasia added a comment. This revision is now accepted and ready to land. LGTM! Great work! Thanks! https://reviews.llvm.org/D38134 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D38134: [OpenCL] Emit enqueued block as kernel
yaxunl added a comment. In https://reviews.llvm.org/D38134#895848, @Anastasia wrote: > I think it would be good to add a block test to CodeGenOpenCL where we would > just call the block without any enqueue and check that the invoke function is > generated but the kernel wrapper isn't. we have test/CodeGenOpenCL/blocks.cl which only calls a block. I can add check to make sure no kernels generated. Comment at: lib/CodeGen/CGBuiltin.cpp:2846 + PtrToSizeArray}; + std::vector ArgTys = {QueueTy, + IntTy, Anastasia wrote: > Formatting seems inconsistent from above. Will fix. Comment at: lib/CodeGen/CodeGenFunction.h:2921 private: - /// Helpers for blocks - llvm::Value *EmitBlockLiteral(const CGBlockInfo &Info); + /// Helpers for blocks. Returns invoke function by \p InvokeF if it is not + /// nullptr. Anastasia wrote: > It will be nullptr in case block is not enqueued? May be it's worth > explaining it in the comment. Will do. Comment at: lib/CodeGen/TargetInfo.h:290 } + /// Create an OpenCL kernel for an enqueued block. + virtual llvm::Function * Anastasia wrote: > Can we also explain the wrapper kernel here? Will do. https://reviews.llvm.org/D38134 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D38134: [OpenCL] Emit enqueued block as kernel
yaxunl updated this revision to Diff 118795. yaxunl marked 5 inline comments as done. yaxunl added a comment. Revised by Anastasia's comments. https://reviews.llvm.org/D38134 Files: lib/CodeGen/CGBlocks.cpp lib/CodeGen/CGBuiltin.cpp lib/CodeGen/CGOpenCLRuntime.cpp lib/CodeGen/CGOpenCLRuntime.h lib/CodeGen/CodeGenFunction.h lib/CodeGen/CodeGenTypes.h lib/CodeGen/TargetInfo.cpp lib/CodeGen/TargetInfo.h test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl test/CodeGenOpenCL/blocks.cl test/CodeGenOpenCL/cl20-device-side-enqueue.cl Index: test/CodeGenOpenCL/cl20-device-side-enqueue.cl === --- test/CodeGenOpenCL/cl20-device-side-enqueue.cl +++ test/CodeGenOpenCL/cl20-device-side-enqueue.cl @@ -6,10 +6,30 @@ typedef void (^bl_t)(local void *); typedef struct {int a;} ndrange_t; -// N.B. The check here only exists to set BL_GLOBAL -// COMMON: @block_G = addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL:@__block_literal_global(\.[0-9]+)?]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*) +// COMMON: %struct.__opencl_block_literal_generic = type { i32, i32, i8 addrspace(4)* } + +// For a block global variable, first emit the block literal as a global variable, then emit the block variable itself. +// COMMON: [[BL_GLOBAL:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INV_G:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: @block_G = addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*) + +// For anonymous blocks without captures, emit block literals as global variable. +// COMMON: [[BLG1:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG2:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG3:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG4:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG5:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG6:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG7:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG8:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* [[INVG8:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG9:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG9:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG10:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG11:@__block_literal_global[^ ]*]] = internal addrspace(1) constant {
[PATCH] D38134: [OpenCL] Emit enqueued block as kernel
Anastasia added a comment. I think it would be good to add a block test to CodeGenOpenCL where we would just call the block without any enqueue and check that the invoke function is generated but the kernel wrapper isn't. Comment at: lib/CodeGen/CGBuiltin.cpp:2846 + PtrToSizeArray}; + std::vector ArgTys = {QueueTy, + IntTy, Formatting seems inconsistent from above. Comment at: lib/CodeGen/CodeGenFunction.h:2921 private: - /// Helpers for blocks - llvm::Value *EmitBlockLiteral(const CGBlockInfo &Info); + /// Helpers for blocks. Returns invoke function by \p InvokeF if it is not + /// nullptr. It will be nullptr in case block is not enqueued? May be it's worth explaining it in the comment. Comment at: lib/CodeGen/TargetInfo.h:290 } + /// Create an OpenCL kernel for an enqueued block. + virtual llvm::Function * Can we also explain the wrapper kernel here? https://reviews.llvm.org/D38134 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D38134: [OpenCL] Emit enqueued block as kernel
yaxunl updated this revision to Diff 118677. yaxunl marked 2 inline comments as done. yaxunl added a comment. Revised by Anastasia's comments. Get block invoke function by API instead of iterate through IR's. Pass the block kernel directly to `__enqueu_kernel functions`. https://reviews.llvm.org/D38134 Files: lib/CodeGen/CGBlocks.cpp lib/CodeGen/CGBuiltin.cpp lib/CodeGen/CGOpenCLRuntime.cpp lib/CodeGen/CGOpenCLRuntime.h lib/CodeGen/CodeGenFunction.h lib/CodeGen/CodeGenTypes.h lib/CodeGen/TargetInfo.cpp lib/CodeGen/TargetInfo.h test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl test/CodeGenOpenCL/cl20-device-side-enqueue.cl Index: test/CodeGenOpenCL/cl20-device-side-enqueue.cl === --- test/CodeGenOpenCL/cl20-device-side-enqueue.cl +++ test/CodeGenOpenCL/cl20-device-side-enqueue.cl @@ -6,10 +6,30 @@ typedef void (^bl_t)(local void *); typedef struct {int a;} ndrange_t; -// N.B. The check here only exists to set BL_GLOBAL -// COMMON: @block_G = addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL:@__block_literal_global(\.[0-9]+)?]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*) +// COMMON: %struct.__opencl_block_literal_generic = type { i32, i32, i8 addrspace(4)* } + +// For a block global variable, first emit the block literal as a global variable, then emit the block variable itself. +// COMMON: [[BL_GLOBAL:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INV_G:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: @block_G = addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*) + +// For anonymous blocks without captures, emit block literals as global variable. +// COMMON: [[BLG1:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG2:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG3:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG4:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG5:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG6:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG7:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG8:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* [[INVG8:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG9:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG9:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG10:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* {{@[^ ]+}} to i8*) to i8 add
[PATCH] D38134: [OpenCL] Emit enqueued block as kernel
yaxunl marked 4 inline comments as done. yaxunl added inline comments. Comment at: lib/CodeGen/CGOpenCLRuntime.cpp:144 + if (auto *I = dyn_cast(V)) { +// If the block literal is emitted as an instruction, it is an alloca +// and the block invoke function is stored to GEP of this alloca. Anastasia wrote: > yaxunl wrote: > > Anastasia wrote: > > > Why do we need to replace original block calls with the kernels? I think > > > in case of calling a block we could use the original block function and > > > only for enqueue use the kernel that would call the block function > > > inside. The pointer to the kernel wrapper could be passed as an > > > additional parameter to `enqueue_kernel` calls. We won't need to iterate > > > through all IR then. > > `CGF.EmitScalarExpr(Block)` returns the block literal structure which > > contains the size/align/invoke_function/captures. The block invoke function > > is stored to the struct by a `StoreInst`. To create the wrapper kernel, we > > need to get the block invoke function, therefore we have to iterate through > > IR. > > > > Since we need to find the store instruction any way, it is simpler to just > > replace the stored function with the kernel and pass the block literal > > struct, instead of passing the kernel separately. > So we cann't get the invoke function from the block literal structure passed > into the kernel wrapper directly knowing its offset? Iterating through IR > adds extra time and also I am not sure how reliable this is wrt different > corner cases of IR. Unfortunately the invoke function is not returned directly. Instead, it is buried in an LLVM value. And to extract the invoke function from the LLVM value we have to wade through a bunch of LLVM IRs. There is one way to get the invoke function directly instead of going through IRs, but we need to change the functions for generating code for the blocks a little bit so that they return the block invoke function. https://reviews.llvm.org/D38134 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D38134: [OpenCL] Emit enqueued block as kernel
Anastasia added inline comments. Comment at: lib/CodeGen/CGOpenCLRuntime.cpp:144 + if (auto *I = dyn_cast(V)) { +// If the block literal is emitted as an instruction, it is an alloca +// and the block invoke function is stored to GEP of this alloca. yaxunl wrote: > Anastasia wrote: > > Why do we need to replace original block calls with the kernels? I think in > > case of calling a block we could use the original block function and only > > for enqueue use the kernel that would call the block function inside. The > > pointer to the kernel wrapper could be passed as an additional parameter to > > `enqueue_kernel` calls. We won't need to iterate through all IR then. > `CGF.EmitScalarExpr(Block)` returns the block literal structure which > contains the size/align/invoke_function/captures. The block invoke function > is stored to the struct by a `StoreInst`. To create the wrapper kernel, we > need to get the block invoke function, therefore we have to iterate through > IR. > > Since we need to find the store instruction any way, it is simpler to just > replace the stored function with the kernel and pass the block literal > struct, instead of passing the kernel separately. So we cann't get the invoke function from the block literal structure passed into the kernel wrapper directly knowing its offset? Iterating through IR adds extra time and also I am not sure how reliable this is wrt different corner cases of IR. Comment at: lib/CodeGen/TargetInfo.cpp:8949 + Builder.restoreIP(IP); + return F; +} yaxunl wrote: > Anastasia wrote: > > Wondering if we should add the kernel metadata (w/o args) since it was used > > for long time to indicate the kernel. > Currently (before this change), clang already does not generate kernel > metadata if there is no vec_type_hint, work_group_size_hint, > reqd_work_group_size. Remember last time we made the change to use function > metadata to represent these attributes. Whether a function is a kernel can be > determined by its calling convention. Ok, let's leave it for now. We can always add it in on request. Comment at: test/CodeGenOpenCL/cl20-device-side-enqueue.cl:297 +// COMMON: define internal spir_kernel void [[INVG5]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}}) +// COMMON: define internal spir_kernel void [[INVG6]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}}, i8 addrspace(3)*{{.*}}, i8 addrspace(3)*{{.*}}) +// COMMON: define internal spir_kernel void [[INVG7]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}}) Perhaps we could check the body of this one too since it has a different prototype. https://reviews.llvm.org/D38134 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D38134: [OpenCL] Emit enqueued block as kernel
yaxunl updated this revision to Diff 118064. yaxunl marked 5 inline comments as done. yaxunl added a comment. Revise by Anastasia's comments. https://reviews.llvm.org/D38134 Files: lib/CodeGen/CGBuiltin.cpp lib/CodeGen/CGOpenCLRuntime.cpp lib/CodeGen/CGOpenCLRuntime.h lib/CodeGen/CodeGenTypes.h lib/CodeGen/TargetInfo.cpp lib/CodeGen/TargetInfo.h test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl test/CodeGenOpenCL/cl20-device-side-enqueue.cl Index: test/CodeGenOpenCL/cl20-device-side-enqueue.cl === --- test/CodeGenOpenCL/cl20-device-side-enqueue.cl +++ test/CodeGenOpenCL/cl20-device-side-enqueue.cl @@ -6,10 +6,33 @@ typedef void (^bl_t)(local void *); typedef struct {int a;} ndrange_t; -// N.B. The check here only exists to set BL_GLOBAL -// COMMON: @block_G = addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL:@__block_literal_global(\.[0-9]+)?]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*) +// COMMON: %struct.__opencl_block_literal_generic = type { i32, i32, i8 addrspace(4)* } + +// For a block global variable, first emit the block literal as a global variable, then emit the block variable itself. +// COMMON: [[BL_GLOBAL:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INV_G:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: @block_G = addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*) + +// For anonymous blocks without captures, emit block literals as global variable. +// COMMON: [[BLG1:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG1:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG2:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG2:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG3:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG3:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG4:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG4:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG5:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG5:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG6:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*)* [[INVG6:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG7:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG7:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG8:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* [[INVG8:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG9:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG9:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG8K:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)*
[PATCH] D38134: [OpenCL] Emit enqueued block as kernel
yaxunl marked 6 inline comments as done. yaxunl added inline comments. Comment at: lib/CodeGen/CGOpenCLRuntime.cpp:144 + if (auto *I = dyn_cast(V)) { +// If the block literal is emitted as an instruction, it is an alloca +// and the block invoke function is stored to GEP of this alloca. Anastasia wrote: > Why do we need to replace original block calls with the kernels? I think in > case of calling a block we could use the original block function and only for > enqueue use the kernel that would call the block function inside. The pointer > to the kernel wrapper could be passed as an additional parameter to > `enqueue_kernel` calls. We won't need to iterate through all IR then. `CGF.EmitScalarExpr(Block)` returns the block literal structure which contains the size/align/invoke_function/captures. The block invoke function is stored to the struct by a `StoreInst`. To create the wrapper kernel, we need to get the block invoke function, therefore we have to iterate through IR. Since we need to find the store instruction any way, it is simpler to just replace the stored function with the kernel and pass the block literal struct, instead of passing the kernel separately. Comment at: lib/CodeGen/TargetInfo.cpp:8927 +llvm::Function * +TargetCodeGenInfo::createEnqueuedBlockKernel(CodeGenFunction &CGF, + llvm::Function *Invoke, Anastasia wrote: > Could you add some comments please? Will do. Comment at: lib/CodeGen/TargetInfo.cpp:8949 + Builder.restoreIP(IP); + return F; +} Anastasia wrote: > Wondering if we should add the kernel metadata (w/o args) since it was used > for long time to indicate the kernel. Currently (before this change), clang already does not generate kernel metadata if there is no vec_type_hint, work_group_size_hint, reqd_work_group_size. Remember last time we made the change to use function metadata to represent these attributes. Whether a function is a kernel can be determined by its calling convention. Comment at: lib/CodeGen/TargetInfo.h:35 class Decl; +class ASTContext; Anastasia wrote: > Do we need this? Will remove it. Comment at: test/CodeGenOpenCL/cl20-device-side-enqueue.cl:9 -// N.B. The check here only exists to set BL_GLOBAL -// COMMON: @block_G = addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL:@__block_literal_global(\.[0-9]+)?]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*) +// COMMON: %struct.__opencl_block_literal_generic = type { i32, i32, i8 addrspace(4)* } + Anastasia wrote: > Can we check generated kernel function too? will do. https://reviews.llvm.org/D38134 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D38134: [OpenCL] Emit enqueued block as kernel
Anastasia added inline comments. Comment at: lib/CodeGen/CGOpenCLRuntime.cpp:144 + if (auto *I = dyn_cast(V)) { +// If the block literal is emitted as an instruction, it is an alloca +// and the block invoke function is stored to GEP of this alloca. Why do we need to replace original block calls with the kernels? I think in case of calling a block we could use the original block function and only for enqueue use the kernel that would call the block function inside. The pointer to the kernel wrapper could be passed as an additional parameter to `enqueue_kernel` calls. We won't need to iterate through all IR then. Comment at: lib/CodeGen/TargetInfo.cpp:8927 +llvm::Function * +TargetCodeGenInfo::createEnqueuedBlockKernel(CodeGenFunction &CGF, + llvm::Function *Invoke, Could you add some comments please? Comment at: lib/CodeGen/TargetInfo.cpp:8949 + Builder.restoreIP(IP); + return F; +} Wondering if we should add the kernel metadata (w/o args) since it was used for long time to indicate the kernel. Comment at: lib/CodeGen/TargetInfo.h:35 class Decl; +class ASTContext; Do we need this? Comment at: test/CodeGenOpenCL/cl20-device-side-enqueue.cl:9 -// N.B. The check here only exists to set BL_GLOBAL -// COMMON: @block_G = addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL:@__block_literal_global(\.[0-9]+)?]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*) +// COMMON: %struct.__opencl_block_literal_generic = type { i32, i32, i8 addrspace(4)* } + Can we check generated kernel function too? https://reviews.llvm.org/D38134 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D38134: [OpenCL] Emit enqueued block as kernel
yaxunl updated this revision to Diff 117739. yaxunl marked an inline comment as done. yaxunl edited the summary of this revision. yaxunl added a comment. Emit enqueued block as a wrapper kernel which calls the block invoke function. Added test for calling and enqueue the same block. https://reviews.llvm.org/D38134 Files: lib/CodeGen/CGBuiltin.cpp lib/CodeGen/CGOpenCLRuntime.cpp lib/CodeGen/CGOpenCLRuntime.h lib/CodeGen/CodeGenTypes.h lib/CodeGen/TargetInfo.cpp lib/CodeGen/TargetInfo.h test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl test/CodeGenOpenCL/cl20-device-side-enqueue.cl Index: test/CodeGenOpenCL/cl20-device-side-enqueue.cl === --- test/CodeGenOpenCL/cl20-device-side-enqueue.cl +++ test/CodeGenOpenCL/cl20-device-side-enqueue.cl @@ -6,10 +6,33 @@ typedef void (^bl_t)(local void *); typedef struct {int a;} ndrange_t; -// N.B. The check here only exists to set BL_GLOBAL -// COMMON: @block_G = addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL:@__block_literal_global(\.[0-9]+)?]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*) +// COMMON: %struct.__opencl_block_literal_generic = type { i32, i32, i8 addrspace(4)* } + +// For a block global variable, first emit the block literal as a global variable, then emit the block variable itself. +// COMMON: [[BL_GLOBAL:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INV_G:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: @block_G = addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*) + +// For anonymous blocks without captures, emit block literals as global variable. +// COMMON: [[BLG1:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG1:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG2:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG2:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG3:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG3:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG4:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG4:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG5:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG5:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG6:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*)* [[INVG6:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG7:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG7:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG8:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* [[INVG8:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG9:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG9:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG8K:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) c
[PATCH] D38134: [OpenCL] Emit enqueued block as kernel
yaxunl marked 10 inline comments as done. yaxunl added a comment. In https://reviews.llvm.org/D38134#880133, @Anastasia wrote: > I think we should add a test case when the same block is both called and > enqueued. Will do. Comment at: test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl:3 + +// CHECK: %[[S1:struct.__amdgpu_block_arg_t.*]] = type { [3 x i64], [1 x i8] } +// CHECK: %[[S2:struct.__amdgpu_block_arg_t.*]] = type { [5 x i64], [1 x i8] } Anastasia wrote: > yaxunl wrote: > > Anastasia wrote: > > > This struct is not identical to block literal struct? > > The LLVM type of the first argument of block invoke function is created > > directly with sorting and rearrangement. There is no AST type corresponding > > to it. However, the function codegen requires AST type of this argument. I > > feel it is unnecessary to create the corresponding AST type. For > > simplicity, just create an AST type with the same size and alignment as the > > LLVM type. In the function code, it will be bitcasted to the correct LLVM > > struct type and get the captured variables. > So `void ptr` won't be possible here? Since it is cast to a right struct > inside the block anyway. Once again a block is a special type object with > known semantic to compiler and runtime in contract to kernels that can be > written with any arbitrary type of arguments. > > I just don't like the idea of duplicating the block invoke function in case > it's being both called and enqueued. Also the login in blocks code generation > becomes difficult to understand. So I am wondering if we could perhaps create > a separate kernel function (as a wrapper) for enqueue_kernel which would call > a block instead. What do you think about it? I think the kernel prototype > would be fairly generic as it would just have a block call inside and pass > the arguments into it... We won't need to modify block generation then at > all. Will emit a wrapper kernel which calls the block invoke function and keep the block invoke function unchanged. https://reviews.llvm.org/D38134 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D38134: [OpenCL] Emit enqueued block as kernel
Anastasia added a comment. I think we should add a test case when the same block is both called and enqueued. Comment at: lib/CodeGen/CGOpenCLRuntime.cpp:113 + +llvm::Value *CGOpenCLRuntime::emitOpenCLEnqueuedBlock(CodeGenFunction &CGF, + const Expr *E) { yaxunl wrote: > Anastasia wrote: > > yaxunl wrote: > > > Anastasia wrote: > > > > I am not particularly in favour of duplicating CodeGen functionality as > > > > it typically has so many special cases that are hard to catch. Is this > > > > logic needed in order to pass to block literal information that the > > > > block is enqueued? > > > This code is needed to emit separate functions for a block which is > > > directly called and also enqueued as a kernel. Since the kernel needs to > > > have proper calling convention and ABI, it cannot be emitted as the same > > > function as when the block is called directly. Since it is OpenCL > > > specific code, I found it is cleaner to separate this code as member of > > > CGOpenCLRuntime instead of fitting it into CGF.EmitBlockLiteral. > > This part is replacing standard `EmitScalarExpr` call which is doing > > several things before calling into block generation. That's why I am a bit > > worried we are covering all the corner cases here. > > > > So if we transform all blocks into kernels unconditionally we won't need > > this special handling then? > > > > Do we generate two versions of the blocks now: one for enqueue and one for > > call? > If we transform all blocks into kernels, we could simplify the logic. > Probably will not need this special handling. > > However, when the block is called directly, the first argument is a private > pointer, when it is executed as a kernel, the first argument is a global > pointer or a struct (for amdgpu target), therefore the emitted functions > cannot be the same. Would using generic address space for this first argument not work? Comment at: test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl:3 + +// CHECK: %[[S1:struct.__amdgpu_block_arg_t.*]] = type { [3 x i64], [1 x i8] } +// CHECK: %[[S2:struct.__amdgpu_block_arg_t.*]] = type { [5 x i64], [1 x i8] } yaxunl wrote: > Anastasia wrote: > > This struct is not identical to block literal struct? > The LLVM type of the first argument of block invoke function is created > directly with sorting and rearrangement. There is no AST type corresponding > to it. However, the function codegen requires AST type of this argument. I > feel it is unnecessary to create the corresponding AST type. For simplicity, > just create an AST type with the same size and alignment as the LLVM type. In > the function code, it will be bitcasted to the correct LLVM struct type and > get the captured variables. So `void ptr` won't be possible here? Since it is cast to a right struct inside the block anyway. Once again a block is a special type object with known semantic to compiler and runtime in contract to kernels that can be written with any arbitrary type of arguments. I just don't like the idea of duplicating the block invoke function in case it's being both called and enqueued. Also the login in blocks code generation becomes difficult to understand. So I am wondering if we could perhaps create a separate kernel function (as a wrapper) for enqueue_kernel which would call a block instead. What do you think about it? I think the kernel prototype would be fairly generic as it would just have a block call inside and pass the arguments into it... We won't need to modify block generation then at all. https://reviews.llvm.org/D38134 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D38134: [OpenCL] Emit enqueued block as kernel
yaxunl marked 10 inline comments as done. yaxunl added inline comments. Comment at: lib/CodeGen/CGOpenCLRuntime.cpp:113 + +llvm::Value *CGOpenCLRuntime::emitOpenCLEnqueuedBlock(CodeGenFunction &CGF, + const Expr *E) { Anastasia wrote: > yaxunl wrote: > > Anastasia wrote: > > > I am not particularly in favour of duplicating CodeGen functionality as > > > it typically has so many special cases that are hard to catch. Is this > > > logic needed in order to pass to block literal information that the > > > block is enqueued? > > This code is needed to emit separate functions for a block which is > > directly called and also enqueued as a kernel. Since the kernel needs to > > have proper calling convention and ABI, it cannot be emitted as the same > > function as when the block is called directly. Since it is OpenCL specific > > code, I found it is cleaner to separate this code as member of > > CGOpenCLRuntime instead of fitting it into CGF.EmitBlockLiteral. > This part is replacing standard `EmitScalarExpr` call which is doing several > things before calling into block generation. That's why I am a bit worried we > are covering all the corner cases here. > > So if we transform all blocks into kernels unconditionally we won't need this > special handling then? > > Do we generate two versions of the blocks now: one for enqueue and one for > call? If we transform all blocks into kernels, we could simplify the logic. Probably will not need this special handling. However, when the block is called directly, the first argument is a private pointer, when it is executed as a kernel, the first argument is a global pointer or a struct (for amdgpu target), therefore the emitted functions cannot be the same. Comment at: lib/CodeGen/CodeGenFunction.cpp:535 +if (i == 0 && IsBlock) { + ty = CGF.CGM.getTargetCodeGenInfo().getEnqueuedBlockArgumentType( + ASTCtx, *CGF.BlockInfo); Anastasia wrote: > yaxunl wrote: > > Anastasia wrote: > > > I don't quite understand why we need to special case this? As far as I > > > undertsnad block argument is a `generic void* ` type but it's being cast > > > to a concrete block struct inside the block function. Do we gain anything > > > from having it a specific type here? > > This argument is not part of BlockDecl. BlockDecl only has arguments shown > > up in the source code. The first argument in the LLVM block invoke function > > is generated by codegen and there is no correspondence in AST, so it has to > > be handled as a special case. > Considering that enqueued kernel always takes the same type of the arguments > (`local void*`) and # of args is specified in `enqueue_kernel`, I was > wondering whether we need to generate the information on the kernel > parameters at all? The enqueueing side will have the information provided in > the `enqueue_kernel` code. > > As for the block itself it can be passed as `generic void*` and then cast to > the block struct type inside the block itself. amdgpu backend relies on kernel argument metadata to generate some metadata in elf for runtime to launch the kernel. The backend expects the kernel argument metadata on each kernel argument. Not generating kernel metadata on the first kernel argument requires special handling in the backend. I think it is better to let Clang generate kernel argument metadata for all kernel arguments. Comment at: lib/CodeGen/CodeGenFunction.cpp:667 llvm::MDNode::get(Context, argTypeQuals)); - if (CGM.getCodeGenOpts().EmitOpenCLArgMetadata) + if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata) Fn->setMetadata("kernel_arg_name", Anastasia wrote: > Why this change? CGM is no longer a function parameter since now this function requires a CGF parameter. Comment at: test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl:3 + +// CHECK: %[[S1:struct.__amdgpu_block_arg_t.*]] = type { [3 x i64], [1 x i8] } +// CHECK: %[[S2:struct.__amdgpu_block_arg_t.*]] = type { [5 x i64], [1 x i8] } Anastasia wrote: > This struct is not identical to block literal struct? The LLVM type of the first argument of block invoke function is created directly with sorting and rearrangement. There is no AST type corresponding to it. However, the function codegen requires AST type of this argument. I feel it is unnecessary to create the corresponding AST type. For simplicity, just create an AST type with the same size and alignment as the LLVM type. In the function code, it will be bitcasted to the correct LLVM struct type and get the captured variables. https://reviews.llvm.org/D38134 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D38134: [OpenCL] Emit enqueued block as kernel
Anastasia added a comment. >> I feel it would be much simpler if we could always generate the kernel >> metadata for blocks. A lot of special case code would be removed if we do >> this. OpenCL doesn't prevent kernel functions to be used just as normal >> functions (6.7.1) so it should be a perfectly valid thing to do. Do you seen >> any issues with that? > > The special cases in metadata generation code is due to the first argument of > LLVM block invoke function is not defined in BlockDecl. Emitting metadata for > all block invoke functions does not help here. To be more specific. I am just wondering what do we need for blocks to be used as kernels pragmatically. I feel it is essentially kernel calling convention and kernel metadata? The kernel arguments metadata however can be omitted because their type is fixed to be `local void*` and the number of arguments is passed into `enqueue_kernel` call so it is known at the enqueueing side too. The block descriptor can be passed as a generic pointer `generic void*` as it is cast to the right struct type inside the block invoke function anyway. So if we do this we can avoid adding a lot of extra code. Because blocks have reduced functionality compared to kernel functions. Also OpenCL allows kernel functions to be called just as normal functions so this way we can support second use case for blocks too. What do you think about it? Comment at: lib/CodeGen/CGOpenCLRuntime.cpp:113 + +llvm::Value *CGOpenCLRuntime::emitOpenCLEnqueuedBlock(CodeGenFunction &CGF, + const Expr *E) { yaxunl wrote: > Anastasia wrote: > > I am not particularly in favour of duplicating CodeGen functionality as it > > typically has so many special cases that are hard to catch. Is this logic > > needed in order to pass to block literal information that the block is > > enqueued? > This code is needed to emit separate functions for a block which is directly > called and also enqueued as a kernel. Since the kernel needs to have proper > calling convention and ABI, it cannot be emitted as the same function as when > the block is called directly. Since it is OpenCL specific code, I found it is > cleaner to separate this code as member of CGOpenCLRuntime instead of fitting > it into CGF.EmitBlockLiteral. This part is replacing standard `EmitScalarExpr` call which is doing several things before calling into block generation. That's why I am a bit worried we are covering all the corner cases here. So if we transform all blocks into kernels unconditionally we won't need this special handling then? Do we generate two versions of the blocks now: one for enqueue and one for call? Comment at: lib/CodeGen/CodeGenFunction.cpp:535 +if (i == 0 && IsBlock) { + ty = CGF.CGM.getTargetCodeGenInfo().getEnqueuedBlockArgumentType( + ASTCtx, *CGF.BlockInfo); yaxunl wrote: > Anastasia wrote: > > I don't quite understand why we need to special case this? As far as I > > undertsnad block argument is a `generic void* ` type but it's being cast to > > a concrete block struct inside the block function. Do we gain anything from > > having it a specific type here? > This argument is not part of BlockDecl. BlockDecl only has arguments shown up > in the source code. The first argument in the LLVM block invoke function is > generated by codegen and there is no correspondence in AST, so it has to be > handled as a special case. Considering that enqueued kernel always takes the same type of the arguments (`local void*`) and # of args is specified in `enqueue_kernel`, I was wondering whether we need to generate the information on the kernel parameters at all? The enqueueing side will have the information provided in the `enqueue_kernel` code. As for the block itself it can be passed as `generic void*` and then cast to the block struct type inside the block itself. Comment at: lib/CodeGen/CodeGenFunction.cpp:667 llvm::MDNode::get(Context, argTypeQuals)); - if (CGM.getCodeGenOpts().EmitOpenCLArgMetadata) + if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata) Fn->setMetadata("kernel_arg_name", Why this change? Comment at: test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl:3 + +// CHECK: %[[S1:struct.__amdgpu_block_arg_t.*]] = type { [3 x i64], [1 x i8] } +// CHECK: %[[S2:struct.__amdgpu_block_arg_t.*]] = type { [5 x i64], [1 x i8] } This struct is not identical to block literal struct? https://reviews.llvm.org/D38134 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D38134: [OpenCL] Emit enqueued block as kernel
yaxunl marked 3 inline comments as done. yaxunl added a comment. In https://reviews.llvm.org/D38134#877831, @Anastasia wrote: > Now if we have a block which is being called and enqueued at the same time, > will we generate 2 functions for it? Could we add such test case btw? Yes. It is covered by test/CodeGenOpenCL/cl20-device-side-enqueue.cl, line 246, 250, and 256. > I feel it would be much simpler if we could always generate the kernel > metadata for blocks. A lot of special case code would be removed if we do > this. OpenCL doesn't prevent kernel functions to be used just as normal > functions (6.7.1) so it should be a perfectly valid thing to do. Do you seen > any issues with that? The special cases in metadata generation code is due to the first argument of LLVM block invoke function is not defined in BlockDecl. Emitting metadata for all block invoke functions does not help here. Comment at: lib/CodeGen/CGBlocks.cpp:1255 // Allocate a stack slot to let the debug info survive the RA. -Address alloc = CreateMemTemp(D->getType(), D->getName() + ".addr"); +Address alloc = CreateMemTemp( +!PV.isIndirect() ? D->getType() Anastasia wrote: > Is there any test that covers this? Yes. This is covered by test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl where the block struct is passed directly to the kernel instead of by a pointer. Comment at: lib/CodeGen/CGOpenCLRuntime.cpp:113 + +llvm::Value *CGOpenCLRuntime::emitOpenCLEnqueuedBlock(CodeGenFunction &CGF, + const Expr *E) { Anastasia wrote: > I am not particularly in favour of duplicating CodeGen functionality as it > typically has so many special cases that are hard to catch. Is this logic > needed in order to pass to block literal information that the block is > enqueued? This code is needed to emit separate functions for a block which is directly called and also enqueued as a kernel. Since the kernel needs to have proper calling convention and ABI, it cannot be emitted as the same function as when the block is called directly. Since it is OpenCL specific code, I found it is cleaner to separate this code as member of CGOpenCLRuntime instead of fitting it into CGF.EmitBlockLiteral. Comment at: lib/CodeGen/CodeGenFunction.cpp:535 +if (i == 0 && IsBlock) { + ty = CGF.CGM.getTargetCodeGenInfo().getEnqueuedBlockArgumentType( + ASTCtx, *CGF.BlockInfo); Anastasia wrote: > I don't quite understand why we need to special case this? As far as I > undertsnad block argument is a `generic void* ` type but it's being cast to a > concrete block struct inside the block function. Do we gain anything from > having it a specific type here? This argument is not part of BlockDecl. BlockDecl only has arguments shown up in the source code. The first argument in the LLVM block invoke function is generated by codegen and there is no correspondence in AST, so it has to be handled as a special case. https://reviews.llvm.org/D38134 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D38134: [OpenCL] Emit enqueued block as kernel
Anastasia added a comment. Now if we have a block which is being called and enqueued at the same time, will we generate 2 functions for it? Could we add such test case btw? I feel it would be much simpler if we could always generate the kernel metadata for blocks. A lot of special case code would be removed if we do this. OpenCL doesn't prevent kernel functions to be used just as normal functions (6.7.1) so it should be a perfectly valid thing to do. Do you seen any issues with that? Comment at: lib/CodeGen/CGBlocks.cpp:1255 // Allocate a stack slot to let the debug info survive the RA. -Address alloc = CreateMemTemp(D->getType(), D->getName() + ".addr"); +Address alloc = CreateMemTemp( +!PV.isIndirect() ? D->getType() Is there any test that covers this? Comment at: lib/CodeGen/CGOpenCLRuntime.cpp:113 + +llvm::Value *CGOpenCLRuntime::emitOpenCLEnqueuedBlock(CodeGenFunction &CGF, + const Expr *E) { I am not particularly in favour of duplicating CodeGen functionality as it typically has so many special cases that are hard to catch. Is this logic needed in order to pass to block literal information that the block is enqueued? Comment at: lib/CodeGen/CodeGenFunction.cpp:535 +if (i == 0 && IsBlock) { + ty = CGF.CGM.getTargetCodeGenInfo().getEnqueuedBlockArgumentType( + ASTCtx, *CGF.BlockInfo); I don't quite understand why we need to special case this? As far as I undertsnad block argument is a `generic void* ` type but it's being cast to a concrete block struct inside the block function. Do we gain anything from having it a specific type here? https://reviews.llvm.org/D38134 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D38134: [OpenCL] Emit enqueued block as kernel
yaxunl created this revision. Herald added a subscriber: nhaehnle. In OpenCL the kernel function and non-kernel function has different calling conventions. For certain targets they have different argument ABIs. Also kernels have special function attributes and metadata for runtime to launch them. The blocks passed to enqueue_kernel is supposed to be executed as kernels. As such, the block invoke function should be emitted as kernel with proper calling convention, argument ABI, function attributes and metadata. However, currently Clang does not emit enqueued blocks as kernels, which causes issues such as address space of captured variables. This patch emits enqueued block as kernel. If a block is both called directly and passed to enqueue_kernel, separate functions will be generated. https://reviews.llvm.org/D38134 Files: lib/CodeGen/CGBlocks.cpp lib/CodeGen/CGBlocks.h lib/CodeGen/CGBuiltin.cpp lib/CodeGen/CGCall.cpp lib/CodeGen/CGDecl.cpp lib/CodeGen/CGOpenCLRuntime.cpp lib/CodeGen/CGOpenCLRuntime.h lib/CodeGen/CodeGenFunction.cpp lib/CodeGen/CodeGenFunction.h lib/CodeGen/CodeGenTypes.h lib/CodeGen/TargetInfo.cpp lib/CodeGen/TargetInfo.h test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl test/CodeGenOpenCL/blocks.cl test/CodeGenOpenCL/cl20-device-side-enqueue.cl Index: test/CodeGenOpenCL/cl20-device-side-enqueue.cl === --- test/CodeGenOpenCL/cl20-device-side-enqueue.cl +++ test/CodeGenOpenCL/cl20-device-side-enqueue.cl @@ -6,10 +6,33 @@ typedef void (^bl_t)(local void *); typedef struct {int a;} ndrange_t; -// N.B. The check here only exists to set BL_GLOBAL -// COMMON: @block_G = addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL:@__block_literal_global(\.[0-9]+)?]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*) +// COMMON: %struct.__opencl_block_literal_generic = type { i32, i32, i8 addrspace(4)* } + +// For a block global variable, first emit the block literal as a global variable, then emit the block variable itself. +// COMMON: [[BL_GLOBAL:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INV_G:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: @block_G = addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*) + +// For anonymous blocks without captures, emit block literals as global variable. +// COMMON: [[BLG1:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(1)*, i8 addrspace(3)*)* [[INVG1:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG2:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(1)*, i8 addrspace(3)*)* [[INVG2:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG3:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(1)*, i8 addrspace(3)*)* [[INVG3:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG4:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(1)*, i8 addrspace(3)*)* [[INVG4:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG5:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(1)*, i8 addrspace(3)*)* [[INVG5:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG6:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(1)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*)* [[INVG6:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG7:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(1)*, i8 addrspace(3)*)* [[INVG7:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG8:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i