[PATCH] D50104: [OpenCL] Always emit alloca in entry block for enqueue_kernel builtin
This revision was automatically updated to reflect the committed changes. Closed by commit rL338899: [OpenCL] Always emit alloca in entry block for enqueue_kernel builtin (authored by scott.linder, committed by ). Herald added a subscriber: llvm-commits. Changed prior to commit: https://reviews.llvm.org/D50104?vs=158816&id=159021#toc Repository: rL LLVM https://reviews.llvm.org/D50104 Files: cfe/trunk/lib/CodeGen/CGBuiltin.cpp cfe/trunk/test/CodeGenOpenCL/cl20-device-side-enqueue.cl cfe/trunk/test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl Index: cfe/trunk/test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl === --- cfe/trunk/test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl +++ cfe/trunk/test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl @@ -0,0 +1,31 @@ +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -emit-llvm -o - -triple amdgcn < %s | FileCheck %s --check-prefixes=COMMON,AMDGPU +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -emit-llvm -o - -triple "spir-unknown-unknown" < %s | FileCheck %s --check-prefixes=COMMON,SPIR32 +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" < %s | FileCheck %s --check-prefixes=COMMON,SPIR64 +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -debug-info-kind=limited -emit-llvm -o - -triple amdgcn < %s | FileCheck %s --check-prefixes=CHECK-DEBUG + +// Check that the enqueue_kernel array temporary is in the entry block to avoid +// a dynamic alloca + +typedef struct {int a;} ndrange_t; + +kernel void test(int i) { +// COMMON-LABEL: define {{.*}} void @test +// COMMON-LABEL: entry: +// AMDGPU: %block_sizes = alloca [1 x i64] +// SPIR32: %block_sizes = alloca [1 x i32] +// SPIR64: %block_sizes = alloca [1 x i64] +// COMMON-LABEL: if.then: +// COMMON-NOT: alloca +// CHECK-DEBUG: getelementptr {{.*}} %block_sizes, {{.*}} !dbg !34 +// COMMON-LABEL: if.end + queue_t default_queue; + unsigned flags = 0; + ndrange_t ndrange; + if (i) +enqueue_kernel(default_queue, flags, ndrange, ^(local void *a) { }, 32); +} + +// Check that 
the temporary is scoped to the `if` + +// CHECK-DEBUG: !32 = distinct !DILexicalBlock(scope: !7, file: !1, line: 24) +// CHECK-DEBUG: !34 = !DILocation(line: 25, scope: !32) Index: cfe/trunk/test/CodeGenOpenCL/cl20-device-side-enqueue.cl === --- cfe/trunk/test/CodeGenOpenCL/cl20-device-side-enqueue.cl +++ cfe/trunk/test/CodeGenOpenCL/cl20-device-side-enqueue.cl @@ -1,5 +1,6 @@ // RUN: %clang_cc1 %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir-unknown-unknown" | FileCheck %s --check-prefix=COMMON --check-prefix=B32 // RUN: %clang_cc1 %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefix=COMMON --check-prefix=B64 +// RUN: %clang_cc1 %s -cl-std=CL2.0 -ffake-address-space-map -O1 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefix=CHECK-LIFETIMES #pragma OPENCL EXTENSION cl_khr_subgroups : enable @@ -46,8 +47,31 @@ // COMMON: %event_wait_list2 = alloca [1 x %opencl.clk_event_t*] clk_event_t event_wait_list2[] = {clk_event}; - // Emits block literal on stack and block kernel [[INVLK1]]. 
// COMMON: [[NDR:%[a-z0-9]+]] = alloca %struct.ndrange_t, align 4 + + // B32: %[[BLOCK_SIZES1:.*]] = alloca [1 x i32] + // B64: %[[BLOCK_SIZES1:.*]] = alloca [1 x i64] + // CHECK-LIFETIMES: %[[BLOCK_SIZES1:.*]] = alloca [1 x i64] + // B32: %[[BLOCK_SIZES2:.*]] = alloca [1 x i32] + // B64: %[[BLOCK_SIZES2:.*]] = alloca [1 x i64] + // CHECK-LIFETIMES: %[[BLOCK_SIZES2:.*]] = alloca [1 x i64] + // B32: %[[BLOCK_SIZES3:.*]] = alloca [1 x i32] + // B64: %[[BLOCK_SIZES3:.*]] = alloca [1 x i64] + // CHECK-LIFETIMES: %[[BLOCK_SIZES3:.*]] = alloca [1 x i64] + // B32: %[[BLOCK_SIZES4:.*]] = alloca [1 x i32] + // B64: %[[BLOCK_SIZES4:.*]] = alloca [1 x i64] + // CHECK-LIFETIMES: %[[BLOCK_SIZES4:.*]] = alloca [1 x i64] + // B32: %[[BLOCK_SIZES5:.*]] = alloca [1 x i32] + // B64: %[[BLOCK_SIZES5:.*]] = alloca [1 x i64] + // CHECK-LIFETIMES: %[[BLOCK_SIZES5:.*]] = alloca [1 x i64] + // B32: %[[BLOCK_SIZES6:.*]] = alloca [3 x i32] + // B64: %[[BLOCK_SIZES6:.*]] = alloca [3 x i64] + // CHECK-LIFETIMES: %[[BLOCK_SIZES6:.*]] = alloca [3 x i64] + // B32: %[[BLOCK_SIZES7:.*]] = alloca [1 x i32] + // B64: %[[BLOCK_SIZES7:.*]] = alloca [1 x i64] + // CHECK-LIFETIMES: %[[BLOCK_SIZES7:.*]] = alloca [1 x i64] + + // Emits block literal on stack and block kernel [[INVLK1]]. // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags // B32: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block to void ()* @@ -73,48 +97,54 @@ // COMMON-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.
[PATCH] D50104: [OpenCL] Always emit alloca in entry block for enqueue_kernel builtin
yaxunl accepted this revision. yaxunl added a comment. LGTM. Thanks! https://reviews.llvm.org/D50104 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D50104: [OpenCL] Always emit alloca in entry block for enqueue_kernel builtin
scott.linder updated this revision to Diff 158816. scott.linder added a comment. Emit lifetime intrinsics for the sizes temp, and update test https://reviews.llvm.org/D50104 Files: lib/CodeGen/CGBuiltin.cpp test/CodeGenOpenCL/cl20-device-side-enqueue.cl test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl Index: test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl === --- /dev/null +++ test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl @@ -0,0 +1,31 @@ +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -emit-llvm -o - -triple amdgcn < %s | FileCheck %s --check-prefixes=COMMON,AMDGPU +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -emit-llvm -o - -triple "spir-unknown-unknown" < %s | FileCheck %s --check-prefixes=COMMON,SPIR32 +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" < %s | FileCheck %s --check-prefixes=COMMON,SPIR64 +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -debug-info-kind=limited -emit-llvm -o - -triple amdgcn < %s | FileCheck %s --check-prefixes=CHECK-DEBUG + +// Check that the enqueue_kernel array temporary is in the entry block to avoid +// a dynamic alloca + +typedef struct {int a;} ndrange_t; + +kernel void test(int i) { +// COMMON-LABEL: define {{.*}} void @test +// COMMON-LABEL: entry: +// AMDGPU: %block_sizes = alloca [1 x i64] +// SPIR32: %block_sizes = alloca [1 x i32] +// SPIR64: %block_sizes = alloca [1 x i64] +// COMMON-LABEL: if.then: +// COMMON-NOT: alloca +// CHECK-DEBUG: getelementptr {{.*}} %block_sizes, {{.*}} !dbg !34 +// COMMON-LABEL: if.end + queue_t default_queue; + unsigned flags = 0; + ndrange_t ndrange; + if (i) +enqueue_kernel(default_queue, flags, ndrange, ^(local void *a) { }, 32); +} + +// Check that the temporary is scoped to the `if` + +// CHECK-DEBUG: !32 = distinct !DILexicalBlock(scope: !7, file: !1, line: 24) +// CHECK-DEBUG: !34 = !DILocation(line: 25, scope: !32) Index: test/CodeGenOpenCL/cl20-device-side-enqueue.cl === --- test/CodeGenOpenCL/cl20-device-side-enqueue.cl +++ 
test/CodeGenOpenCL/cl20-device-side-enqueue.cl @@ -1,5 +1,6 @@ // RUN: %clang_cc1 %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir-unknown-unknown" | FileCheck %s --check-prefix=COMMON --check-prefix=B32 // RUN: %clang_cc1 %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefix=COMMON --check-prefix=B64 +// RUN: %clang_cc1 %s -cl-std=CL2.0 -ffake-address-space-map -O1 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefix=CHECK-LIFETIMES #pragma OPENCL EXTENSION cl_khr_subgroups : enable @@ -46,8 +47,31 @@ // COMMON: %event_wait_list2 = alloca [1 x %opencl.clk_event_t*] clk_event_t event_wait_list2[] = {clk_event}; - // Emits block literal on stack and block kernel [[INVLK1]]. // COMMON: [[NDR:%[a-z0-9]+]] = alloca %struct.ndrange_t, align 4 + + // B32: %[[BLOCK_SIZES1:.*]] = alloca [1 x i32] + // B64: %[[BLOCK_SIZES1:.*]] = alloca [1 x i64] + // CHECK-LIFETIMES: %[[BLOCK_SIZES1:.*]] = alloca [1 x i64] + // B32: %[[BLOCK_SIZES2:.*]] = alloca [1 x i32] + // B64: %[[BLOCK_SIZES2:.*]] = alloca [1 x i64] + // CHECK-LIFETIMES: %[[BLOCK_SIZES2:.*]] = alloca [1 x i64] + // B32: %[[BLOCK_SIZES3:.*]] = alloca [1 x i32] + // B64: %[[BLOCK_SIZES3:.*]] = alloca [1 x i64] + // CHECK-LIFETIMES: %[[BLOCK_SIZES3:.*]] = alloca [1 x i64] + // B32: %[[BLOCK_SIZES4:.*]] = alloca [1 x i32] + // B64: %[[BLOCK_SIZES4:.*]] = alloca [1 x i64] + // CHECK-LIFETIMES: %[[BLOCK_SIZES4:.*]] = alloca [1 x i64] + // B32: %[[BLOCK_SIZES5:.*]] = alloca [1 x i32] + // B64: %[[BLOCK_SIZES5:.*]] = alloca [1 x i64] + // CHECK-LIFETIMES: %[[BLOCK_SIZES5:.*]] = alloca [1 x i64] + // B32: %[[BLOCK_SIZES6:.*]] = alloca [3 x i32] + // B64: %[[BLOCK_SIZES6:.*]] = alloca [3 x i64] + // CHECK-LIFETIMES: %[[BLOCK_SIZES6:.*]] = alloca [3 x i64] + // B32: %[[BLOCK_SIZES7:.*]] = alloca [1 x i32] + // B64: %[[BLOCK_SIZES7:.*]] = alloca [1 x i64] + // CHECK-LIFETIMES: %[[BLOCK_SIZES7:.*]] = alloca 
[1 x i64] + + // Emits block literal on stack and block kernel [[INVLK1]]. // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags // B32: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block to void ()* @@ -73,48 +97,54 @@ // COMMON-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}}* addrspace(4)* [[WAIT_EVNT]], %opencl.clk_event_t{{.*}}* addrspace(4)* [[EVNT]], // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8* bitcast ({{.*}} [[INVLK2:[^ ]+_kernel]] to i8*) to i8 addrspace(4)*), // COMMON-SAME: i8 addrspace(4)* [[BL_I8]]) - enqueue_kernel(default_queue, flags, ndrange, 2, &event_wait_list, &clk_event,
[PATCH] D50104: [OpenCL] Always emit alloca in entry block for enqueue_kernel builtin
yaxunl added a comment. In https://reviews.llvm.org/D50104#1185920, @scott.linder wrote: > I still don't quite see what you describe; with that change all of the > lifetime.end calls pile up just before the enclosing function returns, not > after each call to enqueue_kernel. Looking at > https://clang.llvm.org/doxygen/EHScopeStack_8h_source.html#l00078 I don't see > any option which isn't based on scope. The lifetime.start calls do occur > where I would expect, though, so I will update the patch. Sorry, my mistake. In this case, the full expression seems to be the calling function, so using pushFullExprCleanup to emit lifetime.end does not work well here. You need to call EmitLifetimeEnd explicitly after emitting the function call. https://reviews.llvm.org/D50104 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D50104: [OpenCL] Always emit alloca in entry block for enqueue_kernel builtin
scott.linder added a comment. I still don't quite see what you describe; with that change all of the lifetime.end calls pile up just before the enclosing function returns, not after each call to enqueue_kernel. Looking at https://clang.llvm.org/doxygen/EHScopeStack_8h_source.html#l00078 I don't see any option which isn't based on scope. The lifetime.start calls do occur where I would expect, though, so I will update the patch. https://reviews.llvm.org/D50104 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D50104: [OpenCL] Always emit alloca in entry block for enqueue_kernel builtin
yaxunl added a comment. In https://reviews.llvm.org/D50104#1184362, @scott.linder wrote: > Address feedback; I hope I understood correctly what debug info to check for. > > I don't see where in CreateMemTemp and friends EmitLifetimeStart gets called, > and I don't see any lifetime intrinsics in the IR even at -O1. Emitting lifetime intrinsics is optional. In this case, since the lifetime of the temp var extends only from just before to just after the function call, emitting lifetime intrinsics can help optimizers. It can be done by code like this: if (auto *Size = EmitLifetimeStart( CGM.getDataLayout().getTypeAllocSize(Alloca.getElementType()), Alloca.getPointer())) { pushFullExprCleanup(NormalEHLifetimeMarker, Alloca, Size); } Then the lifetime.start should be emitted before the function call and lifetime.end should be emitted just after the function call. https://reviews.llvm.org/D50104 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D50104: [OpenCL] Always emit alloca in entry block for enqueue_kernel builtin
svenvh accepted this revision. svenvh added a comment. This revision is now accepted and ready to land. LGTM, thanks! https://reviews.llvm.org/D50104 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D50104: [OpenCL] Always emit alloca in entry block for enqueue_kernel builtin
scott.linder updated this revision to Diff 158618. scott.linder added a comment. Update test https://reviews.llvm.org/D50104 Files: lib/CodeGen/CGBuiltin.cpp test/CodeGenOpenCL/cl20-device-side-enqueue.cl test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl Index: test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl === --- /dev/null +++ test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl @@ -0,0 +1,31 @@ +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -emit-llvm -o - -triple amdgcn < %s | FileCheck %s --check-prefixes=COMMON,AMDGPU +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -emit-llvm -o - -triple "spir-unknown-unknown" < %s | FileCheck %s --check-prefixes=COMMON,SPIR32 +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" < %s | FileCheck %s --check-prefixes=COMMON,SPIR64 +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -debug-info-kind=limited -emit-llvm -o - -triple amdgcn < %s | FileCheck %s --check-prefixes=CHECK-DEBUG + +// Check that the enqueue_kernel array temporary is in the entry block to avoid +// a dynamic alloca + +typedef struct {int a;} ndrange_t; + +kernel void test(int i) { +// COMMON-LABEL: define {{.*}} void @test +// COMMON-LABEL: entry: +// AMDGPU: %block_sizes = alloca [1 x i64] +// SPIR32: %block_sizes = alloca [1 x i32] +// SPIR64: %block_sizes = alloca [1 x i64] +// COMMON-LABEL: if.then: +// COMMON-NOT: alloca +// CHECK-DEBUG: getelementptr {{.*}} %block_sizes, {{.*}} !dbg !34 +// COMMON-LABEL: if.end + queue_t default_queue; + unsigned flags = 0; + ndrange_t ndrange; + if (i) +enqueue_kernel(default_queue, flags, ndrange, ^(local void *a) { }, 32); +} + +// Check that the temporary is scoped to the `if` + +// CHECK-DEBUG: !32 = distinct !DILexicalBlock(scope: !7, file: !1, line: 24) +// CHECK-DEBUG: !34 = !DILocation(line: 25, scope: !32) Index: test/CodeGenOpenCL/cl20-device-side-enqueue.cl === --- test/CodeGenOpenCL/cl20-device-side-enqueue.cl +++ test/CodeGenOpenCL/cl20-device-side-enqueue.cl @@ -46,8 +46,24 @@ // COMMON: 
%event_wait_list2 = alloca [1 x %opencl.clk_event_t*] clk_event_t event_wait_list2[] = {clk_event}; - // Emits block literal on stack and block kernel [[INVLK1]]. // COMMON: [[NDR:%[a-z0-9]+]] = alloca %struct.ndrange_t, align 4 + + // B32: %[[BLOCK_SIZES1:.*]] = alloca [1 x i32] + // B64: %[[BLOCK_SIZES1:.*]] = alloca [1 x i64] + // B32: %[[BLOCK_SIZES2:.*]] = alloca [1 x i32] + // B64: %[[BLOCK_SIZES2:.*]] = alloca [1 x i64] + // B32: %[[BLOCK_SIZES3:.*]] = alloca [1 x i32] + // B64: %[[BLOCK_SIZES3:.*]] = alloca [1 x i64] + // B32: %[[BLOCK_SIZES4:.*]] = alloca [1 x i32] + // B64: %[[BLOCK_SIZES4:.*]] = alloca [1 x i64] + // B32: %[[BLOCK_SIZES5:.*]] = alloca [1 x i32] + // B64: %[[BLOCK_SIZES5:.*]] = alloca [1 x i64] + // B32: %[[BLOCK_SIZES6:.*]] = alloca [3 x i32] + // B64: %[[BLOCK_SIZES6:.*]] = alloca [3 x i64] + // B32: %[[BLOCK_SIZES7:.*]] = alloca [1 x i32] + // B64: %[[BLOCK_SIZES7:.*]] = alloca [1 x i64] + + // Emits block literal on stack and block kernel [[INVLK1]]. // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags // B32: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block to void ()* @@ -73,48 +89,44 @@ // COMMON-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}}* addrspace(4)* [[WAIT_EVNT]], %opencl.clk_event_t{{.*}}* addrspace(4)* [[EVNT]], // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8* bitcast ({{.*}} [[INVLK2:[^ ]+_kernel]] to i8*) to i8 addrspace(4)*), // COMMON-SAME: i8 addrspace(4)* [[BL_I8]]) - enqueue_kernel(default_queue, flags, ndrange, 2, &event_wait_list, &clk_event, ^(void) { a[i] = b[i]; }); // Emits global block literal [[BLG1]] and block kernel [[INVGK1]]. 
// COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags - // B32: %[[TMP:.*]] = alloca [1 x i32] - // B32: %[[TMP1:.*]] = getelementptr [1 x i32], [1 x i32]* %[[TMP]], i32 0, i32 0 - // B32: store i32 256, i32* %[[TMP1]], align 4 - // B64: %[[TMP:.*]] = alloca [1 x i64] - // B64: %[[TMP1:.*]] = getelementptr [1 x i64], [1 x i64]* %[[TMP]], i32 0, i32 0 - // B64: store i64 256, i64* %[[TMP1]], align 8 + // B32: %[[TMP:.*]] = getelementptr [1 x i32], [1 x i32]* %[[BLOCK_SIZES1]], i32 0, i32 0 + // B32: store i32 256, i32* %[[TMP]], align 4 + // B64: %[[TMP:.*]] = getelementptr [1 x i64], [1 x i64]* %[[BLOCK_SIZES1]], i32 0, i32 0 + // B64: store i64 256, i64* %[[TMP]], align 8 // COMMON-LABEL: call i32 @__enqueue_kernel_varargs( // COMMON-SAME: %opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]]
[PATCH] D50104: [OpenCL] Always emit alloca in entry block for enqueue_kernel builtin
svenvh added a comment. You'll probably also need to update `test/CodeGenOpenCL/cl20-device-side-enqueue.cl`; please verify with make/ninja `check-clang`. https://reviews.llvm.org/D50104 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D50104: [OpenCL] Always emit alloca in entry block for enqueue_kernel builtin
scott.linder updated this revision to Diff 158545. scott.linder added a comment. Address feedback; I hope I understood correctly what debug info to check for. I don't see where in CreateMemTemp and friends EmitLifetimeStart gets called, and I don't see any lifetime intrinsics in the IR even at -O1. https://reviews.llvm.org/D50104 Files: lib/CodeGen/CGBuiltin.cpp test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl Index: test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl === --- /dev/null +++ test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl @@ -0,0 +1,31 @@ +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -emit-llvm -o - -triple amdgcn < %s | FileCheck %s --check-prefixes=COMMON,AMDGPU +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -emit-llvm -o - -triple "spir-unknown-unknown" < %s | FileCheck %s --check-prefixes=COMMON,SPIR32 +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" < %s | FileCheck %s --check-prefixes=COMMON,SPIR64 +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -debug-info-kind=limited -emit-llvm -o - -triple amdgcn < %s | FileCheck %s --check-prefixes=CHECK-DEBUG + +// Check that the enqueue_kernel array temporary is in the entry block to avoid +// a dynamic alloca + +typedef struct {int a;} ndrange_t; + +kernel void test(int i) { +// COMMON-LABEL: define {{.*}} void @test +// COMMON-LABEL: entry: +// AMDGPU: %block_sizes = alloca [1 x i64] +// SPIR32: %block_sizes = alloca [1 x i32] +// SPIR64: %block_sizes = alloca [1 x i64] +// COMMON-LABEL: if.then: +// COMMON-NOT: alloca +// CHECK-DEBUG: getelementptr {{.*}} %block_sizes, {{.*}} !dbg !34 +// COMMON-LABEL: if.end + queue_t default_queue; + unsigned flags = 0; + ndrange_t ndrange; + if (i) +enqueue_kernel(default_queue, flags, ndrange, ^(local void *a) { }, 32); +} + +// Check that the temporary is scoped to the `if` + +// CHECK-DEBUG: !32 = distinct !DILexicalBlock(scope: !7, file: !1, line: 24) +// CHECK-DEBUG: !34 = !DILocation(line: 25, scope: !32) Index: lib/CodeGen/CGBuiltin.cpp 
=== --- lib/CodeGen/CGBuiltin.cpp +++ lib/CodeGen/CGBuiltin.cpp @@ -3338,15 +3338,18 @@ // Create a temporary array to hold the sizes of local pointer arguments // for the block. \p First is the position of the first size argument. auto CreateArrayForSizeVar = [=](unsigned First) { - auto *AT = llvm::ArrayType::get(SizeTy, NumArgs - First); - auto *Arr = Builder.CreateAlloca(AT); + llvm::APInt ArraySize(32, NumArgs - First); + QualType SizeArrayTy = getContext().getConstantArrayType( + getContext().getSizeType(), ArraySize, ArrayType::Normal, + /*IndexTypeQuals=*/0); + auto Tmp = CreateMemTemp(SizeArrayTy, "block_sizes"); llvm::Value *Ptr; // Each of the following arguments specifies the size of the corresponding // argument passed to the enqueued block. auto *Zero = llvm::ConstantInt::get(IntTy, 0); for (unsigned I = First; I < NumArgs; ++I) { auto *Index = llvm::ConstantInt::get(IntTy, I - First); -auto *GEP = Builder.CreateGEP(Arr, {Zero, Index}); +auto *GEP = Builder.CreateGEP(Tmp.getPointer(), {Zero, Index}); if (I == First) Ptr = GEP; auto *V = Index: test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl === --- /dev/null +++ test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl @@ -0,0 +1,31 @@ +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -emit-llvm -o - -triple amdgcn < %s | FileCheck %s --check-prefixes=COMMON,AMDGPU +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -emit-llvm -o - -triple "spir-unknown-unknown" < %s | FileCheck %s --check-prefixes=COMMON,SPIR32 +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" < %s | FileCheck %s --check-prefixes=COMMON,SPIR64 +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -debug-info-kind=limited -emit-llvm -o - -triple amdgcn < %s | FileCheck %s --check-prefixes=CHECK-DEBUG + +// Check that the enqueue_kernel array temporary is in the entry block to avoid +// a dynamic alloca + +typedef struct {int a;} ndrange_t; + +kernel void test(int i) { +// COMMON-LABEL: define {{.*}} void @test +// COMMON-LABEL: entry: +// 
AMDGPU: %block_sizes = alloca [1 x i64] +// SPIR32: %block_sizes = alloca [1 x i32] +// SPIR64: %block_sizes = alloca [1 x i64] +// COMMON-LABEL: if.then: +// COMMON-NOT: alloca +// CHECK-DEBUG: getelementptr {{.*}} %block_sizes, {{.*}} !dbg !34 +// COMMON-LABEL: if.end + queue_t default_queue; + unsigned flags = 0; + ndrange_t ndrange; + if (i) +enqueue_kernel(default_queue, flags, ndrange, ^(local void *a) { }, 32); +} + +// Check that the temporary is scoped to the `if` + +// CHECK-DEBUG: !32 = distinct !DILexicalBlock(scope: !7, file: !1, line: 24) +// CHECK-DEBUG: !34 = !DILocation(line: 25, scope: !32) Index: lib/
[PATCH] D50104: [OpenCL] Always emit alloca in entry block for enqueue_kernel builtin
arsenm added a comment. Should this also test for lifetime markers? Repository: rC Clang https://reviews.llvm.org/D50104 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D50104: [OpenCL] Always emit alloca in entry block for enqueue_kernel builtin
yaxunl added inline comments. Comment at: lib/CodeGen/CGBuiltin.cpp:3342 auto *AT = llvm::ArrayType::get(SizeTy, NumArgs - First); + // Always insert the alloca in the entry block so it remains static in + // the SelectionDAG. + BasicBlock *Begin = nullptr; + if (Instruction *Entry = CurFn->getEntryBlock().getTerminator()) { +Begin = Builder.GetInsertBlock(); +Builder.SetInsertPoint(Entry); + } auto *Arr = Builder.CreateAlloca(AT); llvm::Value *Ptr; You may try CreateMemTemp. It should handle the insert position and also debug info. Comment at: test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl:2 +// RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 -emit-llvm -o - -triple amdgcn | FileCheck %s --check-prefixes=COMMON,AMDGPU +// RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 -emit-llvm -o - -triple "spir-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,SPIR32 +// RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,SPIR64 Can we have a run line for debug info? Repository: rC Clang https://reviews.llvm.org/D50104 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D50104: [OpenCL] Always emit alloca in entry block for enqueue_kernel builtin
scott.linder created this revision. scott.linder added reviewers: yaxunl, Anastasia, arsenm. Herald added subscribers: cfe-commits, wdng. Ensures the statically sized alloca is not converted to DYNAMIC_STACKALLOC later because it is not in the entry block. I believe it is valid to insert the alloca in the entry block, but I'm not confident the way I accomplish it is correct. Repository: rC Clang https://reviews.llvm.org/D50104 Files: lib/CodeGen/CGBuiltin.cpp test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl Index: test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl === --- /dev/null +++ test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl @@ -0,0 +1,21 @@ +// RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 -emit-llvm -o - -triple amdgcn | FileCheck %s --check-prefixes=COMMON,AMDGPU +// RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 -emit-llvm -o - -triple "spir-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,SPIR32 +// RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,SPIR64 + +typedef struct {int a;} ndrange_t; + +kernel void test(int i) { +// COMMON-LABEL: define {{.*}} void @test +// COMMON-LABEL: entry: +// AMDGPU: alloca [1 x i64] +// SPIR32: alloca [1 x i32] +// SPIR64: alloca [1 x i64] +// COMMON-LABEL: if.then: +// COMMON-NOT: alloca +// COMMON-LABEL: if.end + queue_t default_queue; + unsigned flags = 0; + ndrange_t ndrange; + if (i) +enqueue_kernel(default_queue, flags, ndrange, ^(local void *a) { }, 32); +} Index: lib/CodeGen/CGBuiltin.cpp === --- lib/CodeGen/CGBuiltin.cpp +++ lib/CodeGen/CGBuiltin.cpp @@ -3339,7 +3339,16 @@ // for the block. \p First is the position of the first size argument. auto CreateArrayForSizeVar = [=](unsigned First) { auto *AT = llvm::ArrayType::get(SizeTy, NumArgs - First); + // Always insert the alloca in the entry block so it remains static in + // the SelectionDAG. 
+ BasicBlock *Begin = nullptr; + if (Instruction *Entry = CurFn->getEntryBlock().getTerminator()) { +Begin = Builder.GetInsertBlock(); +Builder.SetInsertPoint(Entry); + } auto *Arr = Builder.CreateAlloca(AT); + if (Begin) +Builder.SetInsertPoint(Begin); llvm::Value *Ptr; // Each of the following arguments specifies the size of the corresponding // argument passed to the enqueued block. Index: test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl === --- /dev/null +++ test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl @@ -0,0 +1,21 @@ +// RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 -emit-llvm -o - -triple amdgcn | FileCheck %s --check-prefixes=COMMON,AMDGPU +// RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 -emit-llvm -o - -triple "spir-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,SPIR32 +// RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,SPIR64 + +typedef struct {int a;} ndrange_t; + +kernel void test(int i) { +// COMMON-LABEL: define {{.*}} void @test +// COMMON-LABEL: entry: +// AMDGPU: alloca [1 x i64] +// SPIR32: alloca [1 x i32] +// SPIR64: alloca [1 x i64] +// COMMON-LABEL: if.then: +// COMMON-NOT: alloca +// COMMON-LABEL: if.end + queue_t default_queue; + unsigned flags = 0; + ndrange_t ndrange; + if (i) +enqueue_kernel(default_queue, flags, ndrange, ^(local void *a) { }, 32); +} Index: lib/CodeGen/CGBuiltin.cpp === --- lib/CodeGen/CGBuiltin.cpp +++ lib/CodeGen/CGBuiltin.cpp @@ -3339,7 +3339,16 @@ // for the block. \p First is the position of the first size argument. auto CreateArrayForSizeVar = [=](unsigned First) { auto *AT = llvm::ArrayType::get(SizeTy, NumArgs - First); + // Always insert the alloca in the entry block so it remains static in + // the SelectionDAG. 
+ BasicBlock *Begin = nullptr; + if (Instruction *Entry = CurFn->getEntryBlock().getTerminator()) { +Begin = Builder.GetInsertBlock(); +Builder.SetInsertPoint(Entry); + } auto *Arr = Builder.CreateAlloca(AT); + if (Begin) +Builder.SetInsertPoint(Begin); llvm::Value *Ptr; // Each of the following arguments specifies the size of the corresponding // argument passed to the enqueued block. ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits