https://github.com/JonChesterfield created https://github.com/llvm/llvm-project/pull/130956
Regenerated existing test case with include-generated-funcs to show the lowered IR for each instantiation. >From 4ec726e4fcf5ab0b03f3942e42a4dbde1a6f43a4 Mon Sep 17 00:00:00 2001 From: Jon Chesterfield <jonathanchesterfi...@gmail.com> Date: Wed, 12 Mar 2025 13:03:05 +0000 Subject: [PATCH] [libc][nfc] Include instantiations of gpuintrin.h in IR test case --- clang/test/Headers/gpuintrin.c | 871 ++++++++++++++++++++++++++++++--- 1 file changed, 808 insertions(+), 63 deletions(-) diff --git a/clang/test/Headers/gpuintrin.c b/clang/test/Headers/gpuintrin.c index 89efe12ee8def..30aa6f147ba03 100644 --- a/clang/test/Headers/gpuintrin.c +++ b/clang/test/Headers/gpuintrin.c @@ -1,10 +1,10 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 -// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --version 5 +// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \ // RUN: -internal-isystem %S/../../lib/Headers/ \ // RUN: -triple amdgcn-amd-amdhsa -emit-llvm %s -o - \ // RUN: | FileCheck %s --check-prefix=AMDGPU // -// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \ +// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \ // RUN: -internal-isystem %S/../../lib/Headers/ \ // RUN: -target-feature +ptx62 \ // RUN: -triple nvptx64-nvidia-cuda -emit-llvm %s -o - \ @@ -12,6 +12,35 @@ #include <gpuintrin.h> +__gpu_kernel void foo() { + __gpu_num_blocks_x(); + __gpu_num_blocks_y(); + __gpu_num_blocks_z(); + __gpu_num_blocks(0); + __gpu_block_id_x(); + __gpu_block_id_y(); + __gpu_block_id_z(); + __gpu_block_id(0); + __gpu_num_threads_x(); + __gpu_num_threads_y(); + __gpu_num_threads_z(); + __gpu_num_threads(0); + __gpu_thread_id_x(); + __gpu_thread_id_y(); + __gpu_thread_id_z(); + __gpu_thread_id(0); + __gpu_num_lanes(); + __gpu_lane_id(); + __gpu_lane_mask(); + __gpu_read_first_lane_u32(-1, -1); + __gpu_ballot(-1, 1); + __gpu_sync_threads(); + __gpu_sync_lane(-1); + __gpu_shuffle_idx_u32(-1, -1, -1, 0); + __gpu_first_lane_id(-1); + __gpu_is_first_in_lane(-1); + __gpu_exit(); +} // AMDGPU-LABEL: define protected amdgpu_kernel void @foo( // AMDGPU-SAME: ) #[[ATTR0:[0-9]+]] { // AMDGPU-NEXT: [[ENTRY:.*:]] @@ -44,52 +73,244 @@ // AMDGPU-NEXT: call void @__gpu_exit() #[[ATTR8:[0-9]+]] // AMDGPU-NEXT: unreachable // -// NVPTX-LABEL: define protected ptx_kernel void @foo( -// NVPTX-SAME: ) #[[ATTR0:[0-9]+]] { -// NVPTX-NEXT: [[ENTRY:.*:]] -// NVPTX-NEXT: [[CALL:%.*]] = call i32 @__gpu_num_blocks_x() #[[ATTR6:[0-9]+]] -// NVPTX-NEXT: [[CALL1:%.*]] = call i32 @__gpu_num_blocks_y() #[[ATTR6]] -// NVPTX-NEXT: [[CALL2:%.*]] = call i32 @__gpu_num_blocks_z() #[[ATTR6]] -// NVPTX-NEXT: [[CALL3:%.*]] = call i32 @__gpu_num_blocks(i32 noundef 0) #[[ATTR6]] -// NVPTX-NEXT: [[CALL4:%.*]] = call i32 @__gpu_block_id_x() #[[ATTR6]] -// NVPTX-NEXT: [[CALL5:%.*]] = call i32 @__gpu_block_id_y() #[[ATTR6]] -// NVPTX-NEXT: [[CALL6:%.*]] = call i32 @__gpu_block_id_z() #[[ATTR6]] -// NVPTX-NEXT: [[CALL7:%.*]] = call i32 @__gpu_block_id(i32 noundef 0) #[[ATTR6]] -// NVPTX-NEXT: [[CALL8:%.*]] = call i32 @__gpu_num_threads_x() #[[ATTR6]] -// NVPTX-NEXT: [[CALL9:%.*]] = call i32 @__gpu_num_threads_y() #[[ATTR6]] -// NVPTX-NEXT: [[CALL10:%.*]] = call i32 @__gpu_num_threads_z() #[[ATTR6]] -// NVPTX-NEXT: [[CALL11:%.*]] = call i32 @__gpu_num_threads(i32 noundef 0) #[[ATTR6]] -// NVPTX-NEXT: [[CALL12:%.*]] = call i32 @__gpu_thread_id_x() #[[ATTR6]] -// NVPTX-NEXT: [[CALL13:%.*]] = call i32 @__gpu_thread_id_y() #[[ATTR6]] -// NVPTX-NEXT: [[CALL14:%.*]] = call i32 @__gpu_thread_id_z() #[[ATTR6]] -// NVPTX-NEXT: [[CALL15:%.*]] = call i32 @__gpu_thread_id(i32 noundef 0) #[[ATTR6]] -// NVPTX-NEXT: [[CALL16:%.*]] = call i32 @__gpu_num_lanes() #[[ATTR6]] -// NVPTX-NEXT: [[CALL17:%.*]] = call i32 @__gpu_lane_id() #[[ATTR6]] -// NVPTX-NEXT: [[CALL18:%.*]] = call i64 @__gpu_lane_mask() #[[ATTR6]] -// NVPTX-NEXT: [[CALL19:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef -1, i32 noundef -1) #[[ATTR6]] -// NVPTX-NEXT: [[CALL20:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR6]] -// NVPTX-NEXT: call void @__gpu_sync_threads() #[[ATTR6]] -// NVPTX-NEXT: call void @__gpu_sync_lane(i64 noundef -1) #[[ATTR6]] -// NVPTX-NEXT: [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR6]] -// NVPTX-NEXT: [[CALL22:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR6]] -// NVPTX-NEXT: [[CALL23:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR6]] -// NVPTX-NEXT: call void @__gpu_exit() #[[ATTR7:[0-9]+]] -// NVPTX-NEXT: unreachable // -__gpu_kernel void foo() { - __gpu_num_blocks_x(); - __gpu_num_blocks_y(); - __gpu_num_blocks_z(); - __gpu_num_blocks(0); - __gpu_block_id_x(); - __gpu_block_id_y(); - __gpu_block_id_z(); - __gpu_block_id(0); - __gpu_num_threads_x(); - __gpu_num_threads_y(); - __gpu_num_threads_z(); - __gpu_num_threads(0); - __gpu_thread_id_x(); +// AMDGPU-LABEL: define internal i32 @__gpu_num_blocks_x( +// AMDGPU-SAME: ) #[[ATTR0]] { +// AMDGPU-NEXT: [[ENTRY:.*:]] +// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGPU-NEXT: [[TMP0:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +// AMDGPU-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP0]], i32 12 +// AMDGPU-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !range [[RNG3:![0-9]+]], !invariant.load [[META4:![0-9]+]] +// AMDGPU-NEXT: [[TMP3:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +// AMDGPU-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP3]], i32 12 +// AMDGPU-NEXT: [[TMP5:%.*]] = load i16, ptr addrspace(4) [[TMP4]], align 2, !range [[RNG5:![0-9]+]], !invariant.load [[META4]], !noundef [[META4]] +// AMDGPU-NEXT: [[CONV:%.*]] = zext i16 [[TMP5]] to i32 +// AMDGPU-NEXT: [[DIV:%.*]] = udiv i32 [[TMP2]], [[CONV]] +// AMDGPU-NEXT: ret i32 [[DIV]] +// +// +// AMDGPU-LABEL: define internal i32 @__gpu_num_blocks_y( +// AMDGPU-SAME: ) #[[ATTR0]] { +// AMDGPU-NEXT: [[ENTRY:.*:]] +// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGPU-NEXT: [[TMP0:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +// AMDGPU-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP0]], i32 16 +// AMDGPU-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !range [[RNG3]], !invariant.load [[META4]] +// AMDGPU-NEXT: [[TMP3:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +// AMDGPU-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP3]], i32 14 +// AMDGPU-NEXT: [[TMP5:%.*]] = load i16, ptr addrspace(4) [[TMP4]], align 2, !range [[RNG5]], !invariant.load [[META4]], !noundef [[META4]] +// AMDGPU-NEXT: [[CONV:%.*]] = zext i16 [[TMP5]] to i32 +// AMDGPU-NEXT: [[DIV:%.*]] = udiv i32 [[TMP2]], [[CONV]] +// AMDGPU-NEXT: ret i32 [[DIV]] +// +// +// AMDGPU-LABEL: define internal i32 @__gpu_num_blocks_z( +// AMDGPU-SAME: ) #[[ATTR0]] { +// AMDGPU-NEXT: [[ENTRY:.*:]] +// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGPU-NEXT: [[TMP0:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +// AMDGPU-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP0]], i32 20 +// AMDGPU-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !range [[RNG3]], !invariant.load [[META4]] +// AMDGPU-NEXT: [[TMP3:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +// AMDGPU-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP3]], i32 16 +// AMDGPU-NEXT: [[TMP5:%.*]] = load i16, ptr addrspace(4) [[TMP4]], align 2, !range [[RNG5]], !invariant.load [[META4]], !noundef [[META4]] +// AMDGPU-NEXT: [[CONV:%.*]] = zext i16 [[TMP5]] to i32 +// AMDGPU-NEXT: [[DIV:%.*]] = udiv i32 [[TMP2]], [[CONV]] +// AMDGPU-NEXT: ret i32 [[DIV]] +// +// +// AMDGPU-LABEL: define internal i32 @__gpu_num_blocks( +// AMDGPU-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] { +// AMDGPU-NEXT: [[ENTRY:.*:]] +// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGPU-NEXT: [[__DIM_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGPU-NEXT: [[__DIM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__DIM_ADDR]] to ptr +// AMDGPU-NEXT: store i32 [[__DIM]], ptr [[__DIM_ADDR_ASCAST]], align 4 +// AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR_ASCAST]], align 4 +// AMDGPU-NEXT: switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [ +// AMDGPU-NEXT: i32 0, label %[[SW_BB:.*]] +// AMDGPU-NEXT: i32 1, label %[[SW_BB1:.*]] +// AMDGPU-NEXT: i32 2, label %[[SW_BB3:.*]] +// AMDGPU-NEXT: ] +// AMDGPU: [[SW_BB]]: +// AMDGPU-NEXT: [[CALL:%.*]] = call i32 @__gpu_num_blocks_x() #[[ATTR7]] +// AMDGPU-NEXT: store i32 [[CALL]], ptr [[RETVAL_ASCAST]], align 4 +// AMDGPU-NEXT: br label %[[RETURN:.*]] +// AMDGPU: [[SW_BB1]]: +// AMDGPU-NEXT: [[CALL2:%.*]] = call i32 @__gpu_num_blocks_y() #[[ATTR7]] +// AMDGPU-NEXT: store i32 [[CALL2]], ptr [[RETVAL_ASCAST]], align 4 +// AMDGPU-NEXT: br label %[[RETURN]] +// AMDGPU: [[SW_BB3]]: +// AMDGPU-NEXT: [[CALL4:%.*]] = call i32 @__gpu_num_blocks_z() #[[ATTR7]] +// AMDGPU-NEXT: store i32 [[CALL4]], ptr [[RETVAL_ASCAST]], align 4 +// AMDGPU-NEXT: br label %[[RETURN]] +// AMDGPU: [[SW_DEFAULT]]: +// AMDGPU-NEXT: unreachable +// AMDGPU: [[RETURN]]: +// AMDGPU-NEXT: [[TMP1:%.*]] = load i32, ptr [[RETVAL_ASCAST]], align 4 +// AMDGPU-NEXT: ret i32 [[TMP1]] +// +// +// AMDGPU-LABEL: define internal i32 @__gpu_block_id_x( +// AMDGPU-SAME: ) #[[ATTR0]] { +// AMDGPU-NEXT: [[ENTRY:.*:]] +// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +// AMDGPU-NEXT: ret i32 [[TMP0]] +// +// +// AMDGPU-LABEL: define internal i32 @__gpu_block_id_y( +// AMDGPU-SAME: ) #[[ATTR0]] { +// AMDGPU-NEXT: [[ENTRY:.*:]] +// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() +// AMDGPU-NEXT: ret i32 [[TMP0]] +// +// +// AMDGPU-LABEL: define internal i32 @__gpu_block_id_z( +// AMDGPU-SAME: ) #[[ATTR0]] { +// AMDGPU-NEXT: [[ENTRY:.*:]] +// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() +// AMDGPU-NEXT: ret i32 [[TMP0]] +// +// +// AMDGPU-LABEL: define internal i32 @__gpu_block_id( +// AMDGPU-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] { +// AMDGPU-NEXT: [[ENTRY:.*:]] +// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGPU-NEXT: [[__DIM_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGPU-NEXT: [[__DIM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__DIM_ADDR]] to ptr +// AMDGPU-NEXT: store i32 [[__DIM]], ptr [[__DIM_ADDR_ASCAST]], align 4 +// AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR_ASCAST]], align 4 +// AMDGPU-NEXT: switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [ +// AMDGPU-NEXT: i32 0, label %[[SW_BB:.*]] +// AMDGPU-NEXT: i32 1, label %[[SW_BB1:.*]] +// AMDGPU-NEXT: i32 2, label %[[SW_BB3:.*]] +// AMDGPU-NEXT: ] +// AMDGPU: [[SW_BB]]: +// AMDGPU-NEXT: [[CALL:%.*]] = call i32 @__gpu_block_id_x() #[[ATTR7]] +// AMDGPU-NEXT: store i32 [[CALL]], ptr [[RETVAL_ASCAST]], align 4 +// AMDGPU-NEXT: br label %[[RETURN:.*]] +// AMDGPU: [[SW_BB1]]: +// AMDGPU-NEXT: [[CALL2:%.*]] = call i32 @__gpu_block_id_y() #[[ATTR7]] +// AMDGPU-NEXT: store i32 [[CALL2]], ptr [[RETVAL_ASCAST]], align 4 +// AMDGPU-NEXT: br label %[[RETURN]] +// AMDGPU: [[SW_BB3]]: +// AMDGPU-NEXT: [[CALL4:%.*]] = call i32 @__gpu_block_id_z() #[[ATTR7]] +// AMDGPU-NEXT: store i32 [[CALL4]], ptr [[RETVAL_ASCAST]], align 4 +// AMDGPU-NEXT: br label %[[RETURN]] +// AMDGPU: [[SW_DEFAULT]]: +// AMDGPU-NEXT: unreachable +// AMDGPU: [[RETURN]]: +// AMDGPU-NEXT: [[TMP1:%.*]] = load i32, ptr [[RETVAL_ASCAST]], align 4 +// AMDGPU-NEXT: ret i32 [[TMP1]] +// +// +// AMDGPU-LABEL: define internal i32 @__gpu_num_threads_x( +// AMDGPU-SAME: ) #[[ATTR0]] { +// AMDGPU-NEXT: [[ENTRY:.*:]] +// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGPU-NEXT: [[TMP0:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +// AMDGPU-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP0]], i32 12 +// AMDGPU-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], align 2, !range [[RNG5]], !invariant.load [[META4]], !noundef [[META4]] +// AMDGPU-NEXT: [[CONV:%.*]] = zext i16 [[TMP2]] to i32 +// AMDGPU-NEXT: ret i32 [[CONV]] +// +// +// AMDGPU-LABEL: define internal i32 @__gpu_num_threads_y( +// AMDGPU-SAME: ) #[[ATTR0]] { +// AMDGPU-NEXT: [[ENTRY:.*:]] +// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGPU-NEXT: [[TMP0:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +// AMDGPU-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP0]], i32 14 +// AMDGPU-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], align 2, !range [[RNG5]], !invariant.load [[META4]], !noundef [[META4]] +// AMDGPU-NEXT: [[CONV:%.*]] = zext i16 [[TMP2]] to i32 +// AMDGPU-NEXT: ret i32 [[CONV]] +// +// +// AMDGPU-LABEL: define internal i32 @__gpu_num_threads_z( +// AMDGPU-SAME: ) #[[ATTR0]] { +// AMDGPU-NEXT: [[ENTRY:.*:]] +// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGPU-NEXT: [[TMP0:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +// AMDGPU-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP0]], i32 16 +// AMDGPU-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], align 2, !range [[RNG5]], !invariant.load [[META4]], !noundef [[META4]] +// AMDGPU-NEXT: [[CONV:%.*]] = zext i16 [[TMP2]] to i32 +// AMDGPU-NEXT: ret i32 [[CONV]] +// +// +// AMDGPU-LABEL: define internal i32 @__gpu_num_threads( +// AMDGPU-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] { +// AMDGPU-NEXT: [[ENTRY:.*:]] +// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGPU-NEXT: [[__DIM_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGPU-NEXT: [[__DIM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__DIM_ADDR]] to ptr +// AMDGPU-NEXT: store i32 [[__DIM]], ptr [[__DIM_ADDR_ASCAST]], align 4 +// AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR_ASCAST]], align 4 +// AMDGPU-NEXT: switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [ +// AMDGPU-NEXT: i32 0, label %[[SW_BB:.*]] +// AMDGPU-NEXT: i32 1, label %[[SW_BB1:.*]] +// AMDGPU-NEXT: i32 2, label %[[SW_BB3:.*]] +// AMDGPU-NEXT: ] +// AMDGPU: [[SW_BB]]: +// AMDGPU-NEXT: [[CALL:%.*]] = call i32 @__gpu_num_threads_x() #[[ATTR7]] +// AMDGPU-NEXT: store i32 [[CALL]], ptr [[RETVAL_ASCAST]], align 4 +// AMDGPU-NEXT: br label %[[RETURN:.*]] +// AMDGPU: [[SW_BB1]]: +// AMDGPU-NEXT: [[CALL2:%.*]] = call i32 @__gpu_num_threads_y() #[[ATTR7]] +// AMDGPU-NEXT: store i32 [[CALL2]], ptr [[RETVAL_ASCAST]], align 4 +// AMDGPU-NEXT: br label %[[RETURN]] +// AMDGPU: [[SW_BB3]]: +// AMDGPU-NEXT: [[CALL4:%.*]] = call i32 @__gpu_num_threads_z() #[[ATTR7]] +// AMDGPU-NEXT: store i32 [[CALL4]], ptr [[RETVAL_ASCAST]], align 4 +// AMDGPU-NEXT: br label %[[RETURN]] +// AMDGPU: [[SW_DEFAULT]]: +// AMDGPU-NEXT: unreachable +// AMDGPU: [[RETURN]]: +// AMDGPU-NEXT: [[TMP1:%.*]] = load i32, ptr [[RETVAL_ASCAST]], align 4 +// AMDGPU-NEXT: ret i32 [[TMP1]] +// +// +// AMDGPU-LABEL: define internal i32 @__gpu_thread_id_x( +// AMDGPU-SAME: ) #[[ATTR0]] { +// AMDGPU-NEXT: [[ENTRY:.*:]] +// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGPU-NEXT: [[TMP0:%.*]] = call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() +// AMDGPU-NEXT: ret i32 [[TMP0]] +// +// +// AMDGPU-LABEL: define internal i32 @__gpu_thread_id_y( +// AMDGPU-SAME: ) #[[ATTR0]] { +// AMDGPU-NEXT: [[ENTRY:.*:]] +// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGPU-NEXT: [[TMP0:%.*]] = call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y() +// AMDGPU-NEXT: ret i32 [[TMP0]] +// +// +// AMDGPU-LABEL: define internal i32 @__gpu_thread_id_z( +// AMDGPU-SAME: ) #[[ATTR0]] { +// AMDGPU-NEXT: [[ENTRY:.*:]] +// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGPU-NEXT: [[TMP0:%.*]] = call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z() +// AMDGPU-NEXT: ret i32 [[TMP0]] +// +// // AMDGPU-LABEL: define internal i32 @__gpu_thread_id( // AMDGPU-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] { // AMDGPU-NEXT: [[ENTRY:.*:]] @@ -122,6 +343,375 @@ __gpu_kernel void foo() { // AMDGPU-NEXT: [[TMP1:%.*]] = load i32, ptr [[RETVAL_ASCAST]], align 4 // AMDGPU-NEXT: ret i32 [[TMP1]] // +// +// AMDGPU-LABEL: define internal i32 @__gpu_num_lanes( +// AMDGPU-SAME: ) #[[ATTR0]] { +// AMDGPU-NEXT: [[ENTRY:.*:]] +// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.wavefrontsize() +// AMDGPU-NEXT: ret i32 [[TMP0]] +// +// +// AMDGPU-LABEL: define internal i32 @__gpu_lane_id( +// AMDGPU-SAME: ) #[[ATTR0]] { +// AMDGPU-NEXT: [[ENTRY:.*:]] +// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) +// AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[TMP0]]) +// AMDGPU-NEXT: ret i32 [[TMP1]] +// +// +// AMDGPU-LABEL: define internal i64 @__gpu_lane_mask( +// AMDGPU-SAME: ) #[[ATTR0]] { +// AMDGPU-NEXT: [[ENTRY:.*:]] +// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i64, align 8, addrspace(5) +// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGPU-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +// AMDGPU-NEXT: ret i64 [[TMP0]] +// +// +// AMDGPU-LABEL: define internal i32 @__gpu_read_first_lane_u32( +// AMDGPU-SAME: i64 noundef [[__LANE_MASK:%.*]], i32 noundef [[__X:%.*]]) #[[ATTR0]] { +// AMDGPU-NEXT: [[ENTRY:.*:]] +// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGPU-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8, addrspace(5) +// AMDGPU-NEXT: [[__X_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGPU-NEXT: [[__LANE_MASK_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__LANE_MASK_ADDR]] to ptr +// AMDGPU-NEXT: [[__X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__X_ADDR]] to ptr +// AMDGPU-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR_ASCAST]], align 8 +// AMDGPU-NEXT: store i32 [[__X]], ptr [[__X_ADDR_ASCAST]], align 4 +// AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[__X_ADDR_ASCAST]], align 4 +// AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP0]]) +// AMDGPU-NEXT: ret i32 [[TMP1]] +// +// +// AMDGPU-LABEL: define internal i64 @__gpu_ballot( +// AMDGPU-SAME: i64 noundef [[__LANE_MASK:%.*]], i1 noundef zeroext [[__X:%.*]]) #[[ATTR0]] { +// AMDGPU-NEXT: [[ENTRY:.*:]] +// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i64, align 8, addrspace(5) +// AMDGPU-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8, addrspace(5) +// AMDGPU-NEXT: [[__X_ADDR:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGPU-NEXT: [[__LANE_MASK_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__LANE_MASK_ADDR]] to ptr +// AMDGPU-NEXT: [[__X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__X_ADDR]] to ptr +// AMDGPU-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR_ASCAST]], align 8 +// AMDGPU-NEXT: [[STOREDV:%.*]] = zext i1 [[__X]] to i8 +// AMDGPU-NEXT: store i8 [[STOREDV]], ptr [[__X_ADDR_ASCAST]], align 1 +// AMDGPU-NEXT: [[TMP0:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_ASCAST]], align 8 +// AMDGPU-NEXT: [[TMP1:%.*]] = load i8, ptr [[__X_ADDR_ASCAST]], align 1 +// AMDGPU-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP1]] to i1 +// AMDGPU-NEXT: [[TMP2:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[LOADEDV]]) +// AMDGPU-NEXT: [[AND:%.*]] = and i64 [[TMP0]], [[TMP2]] +// AMDGPU-NEXT: ret i64 [[AND]] +// +// +// AMDGPU-LABEL: define internal void @__gpu_sync_threads( +// AMDGPU-SAME: ) #[[ATTR0]] { +// AMDGPU-NEXT: [[ENTRY:.*:]] +// AMDGPU-NEXT: call void @llvm.amdgcn.s.barrier() +// AMDGPU-NEXT: fence syncscope("workgroup") seq_cst +// AMDGPU-NEXT: ret void +// +// +// AMDGPU-LABEL: define internal void @__gpu_sync_lane( +// AMDGPU-SAME: i64 noundef [[__LANE_MASK:%.*]]) #[[ATTR0]] { +// AMDGPU-NEXT: [[ENTRY:.*:]] +// AMDGPU-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8, addrspace(5) +// AMDGPU-NEXT: [[__LANE_MASK_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__LANE_MASK_ADDR]] to ptr +// AMDGPU-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR_ASCAST]], align 8 +// AMDGPU-NEXT: call void @llvm.amdgcn.wave.barrier() +// AMDGPU-NEXT: ret void +// +// +// AMDGPU-LABEL: define internal i32 @__gpu_shuffle_idx_u32( +// AMDGPU-SAME: i64 noundef [[__LANE_MASK:%.*]], i32 noundef [[__IDX:%.*]], i32 noundef [[__X:%.*]], i32 noundef [[__WIDTH:%.*]]) #[[ATTR0]] { +// AMDGPU-NEXT: [[ENTRY:.*:]] +// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGPU-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8, addrspace(5) +// AMDGPU-NEXT: [[__IDX_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGPU-NEXT: [[__X_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGPU-NEXT: [[__WIDTH_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGPU-NEXT: [[__LANE:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGPU-NEXT: [[__LANE_MASK_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__LANE_MASK_ADDR]] to ptr +// AMDGPU-NEXT: [[__IDX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__IDX_ADDR]] to ptr +// AMDGPU-NEXT: [[__X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__X_ADDR]] to ptr +// AMDGPU-NEXT: [[__WIDTH_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__WIDTH_ADDR]] to ptr +// AMDGPU-NEXT: [[__LANE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__LANE]] to ptr +// AMDGPU-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR_ASCAST]], align 8 +// AMDGPU-NEXT: store i32 [[__IDX]], ptr [[__IDX_ADDR_ASCAST]], align 4 +// AMDGPU-NEXT: store i32 [[__X]], ptr [[__X_ADDR_ASCAST]], align 4 +// AMDGPU-NEXT: store i32 [[__WIDTH]], ptr [[__WIDTH_ADDR_ASCAST]], align 4 +// AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[__IDX_ADDR_ASCAST]], align 4 +// AMDGPU-NEXT: [[CALL:%.*]] = call i32 @__gpu_lane_id() #[[ATTR7]] +// AMDGPU-NEXT: [[TMP1:%.*]] = load i32, ptr [[__WIDTH_ADDR_ASCAST]], align 4 +// AMDGPU-NEXT: [[SUB:%.*]] = sub i32 [[TMP1]], 1 +// AMDGPU-NEXT: [[NOT:%.*]] = xor i32 [[SUB]], -1 +// AMDGPU-NEXT: [[AND:%.*]] = and i32 [[CALL]], [[NOT]] +// AMDGPU-NEXT: [[ADD:%.*]] = add i32 [[TMP0]], [[AND]] +// AMDGPU-NEXT: store i32 [[ADD]], ptr [[__LANE_ASCAST]], align 4 +// AMDGPU-NEXT: [[TMP2:%.*]] = load i32, ptr [[__LANE_ASCAST]], align 4 +// AMDGPU-NEXT: [[SHL:%.*]] = shl i32 [[TMP2]], 2 +// AMDGPU-NEXT: [[TMP3:%.*]] = load i32, ptr [[__X_ADDR_ASCAST]], align 4 +// AMDGPU-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[SHL]], i32 [[TMP3]]) +// AMDGPU-NEXT: ret i32 [[TMP4]] +// +// +// AMDGPU-LABEL: define internal i64 @__gpu_first_lane_id( +// AMDGPU-SAME: i64 noundef [[__LANE_MASK:%.*]]) #[[ATTR0]] { +// AMDGPU-NEXT: [[ENTRY:.*:]] +// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i64, align 8, addrspace(5) +// AMDGPU-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8, addrspace(5) +// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGPU-NEXT: [[__LANE_MASK_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__LANE_MASK_ADDR]] to ptr +// AMDGPU-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR_ASCAST]], align 8 +// AMDGPU-NEXT: [[TMP0:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_ASCAST]], align 8 +// AMDGPU-NEXT: [[TMP1:%.*]] = call i64 @llvm.cttz.i64(i64 [[TMP0]], i1 true) +// AMDGPU-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 1 +// AMDGPU-NEXT: [[ISZERO:%.*]] = icmp eq i64 [[TMP0]], 0 +// AMDGPU-NEXT: [[FFS:%.*]] = select i1 [[ISZERO]], i64 0, i64 [[TMP2]] +// AMDGPU-NEXT: [[CAST:%.*]] = trunc i64 [[FFS]] to i32 +// AMDGPU-NEXT: [[SUB:%.*]] = sub nsw i32 [[CAST]], 1 +// AMDGPU-NEXT: [[CONV:%.*]] = sext i32 [[SUB]] to i64 +// AMDGPU-NEXT: ret i64 [[CONV]] +// +// +// AMDGPU-LABEL: define internal zeroext i1 @__gpu_is_first_in_lane( +// AMDGPU-SAME: i64 noundef [[__LANE_MASK:%.*]]) #[[ATTR0]] { +// AMDGPU-NEXT: [[ENTRY:.*:]] +// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5) +// AMDGPU-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8, addrspace(5) +// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGPU-NEXT: [[__LANE_MASK_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__LANE_MASK_ADDR]] to ptr +// AMDGPU-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR_ASCAST]], align 8 +// AMDGPU-NEXT: [[CALL:%.*]] = call i32 @__gpu_lane_id() #[[ATTR7]] +// AMDGPU-NEXT: [[CONV:%.*]] = zext i32 [[CALL]] to i64 +// AMDGPU-NEXT: [[TMP0:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_ASCAST]], align 8 +// AMDGPU-NEXT: [[CALL1:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef [[TMP0]]) #[[ATTR7]] +// AMDGPU-NEXT: [[CMP:%.*]] = icmp eq i64 [[CONV]], [[CALL1]] +// AMDGPU-NEXT: ret i1 [[CMP]] +// +// +// AMDGPU-LABEL: define internal void @__gpu_exit( +// AMDGPU-SAME: ) #[[ATTR1:[0-9]+]] { +// AMDGPU-NEXT: [[ENTRY:.*:]] +// AMDGPU-NEXT: call void @llvm.amdgcn.endpgm() +// AMDGPU-NEXT: ret void +// +// +// NVPTX-LABEL: define protected ptx_kernel void @foo( +// NVPTX-SAME: ) #[[ATTR0:[0-9]+]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[CALL:%.*]] = call i32 @__gpu_num_blocks_x() #[[ATTR6:[0-9]+]] +// NVPTX-NEXT: [[CALL1:%.*]] = call i32 @__gpu_num_blocks_y() #[[ATTR6]] +// NVPTX-NEXT: [[CALL2:%.*]] = call i32 @__gpu_num_blocks_z() #[[ATTR6]] +// NVPTX-NEXT: [[CALL3:%.*]] = call i32 @__gpu_num_blocks(i32 noundef 0) #[[ATTR6]] +// NVPTX-NEXT: [[CALL4:%.*]] = call i32 @__gpu_block_id_x() #[[ATTR6]] +// NVPTX-NEXT: [[CALL5:%.*]] = call i32 @__gpu_block_id_y() #[[ATTR6]] +// NVPTX-NEXT: [[CALL6:%.*]] = call i32 @__gpu_block_id_z() #[[ATTR6]] +// NVPTX-NEXT: [[CALL7:%.*]] = call i32 @__gpu_block_id(i32 noundef 0) #[[ATTR6]] +// NVPTX-NEXT: [[CALL8:%.*]] = call i32 @__gpu_num_threads_x() #[[ATTR6]] +// NVPTX-NEXT: [[CALL9:%.*]] = call i32 @__gpu_num_threads_y() #[[ATTR6]] +// NVPTX-NEXT: [[CALL10:%.*]] = call i32 @__gpu_num_threads_z() #[[ATTR6]] +// NVPTX-NEXT: [[CALL11:%.*]] = call i32 @__gpu_num_threads(i32 noundef 0) #[[ATTR6]] +// NVPTX-NEXT: [[CALL12:%.*]] = call i32 @__gpu_thread_id_x() #[[ATTR6]] +// NVPTX-NEXT: [[CALL13:%.*]] = call i32 @__gpu_thread_id_y() #[[ATTR6]] +// NVPTX-NEXT: [[CALL14:%.*]] = call i32 @__gpu_thread_id_z() #[[ATTR6]] +// NVPTX-NEXT: [[CALL15:%.*]] = call i32 @__gpu_thread_id(i32 noundef 0) #[[ATTR6]] +// NVPTX-NEXT: [[CALL16:%.*]] = call i32 @__gpu_num_lanes() #[[ATTR6]] +// NVPTX-NEXT: [[CALL17:%.*]] = call i32 @__gpu_lane_id() #[[ATTR6]] +// NVPTX-NEXT: [[CALL18:%.*]] = call i64 @__gpu_lane_mask() #[[ATTR6]] +// NVPTX-NEXT: [[CALL19:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef -1, i32 noundef -1) #[[ATTR6]] +// NVPTX-NEXT: [[CALL20:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR6]] +// NVPTX-NEXT: call void @__gpu_sync_threads() #[[ATTR6]] +// NVPTX-NEXT: call void @__gpu_sync_lane(i64 noundef -1) #[[ATTR6]] +// NVPTX-NEXT: [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR6]] +// NVPTX-NEXT: [[CALL22:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR6]] +// NVPTX-NEXT: [[CALL23:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR6]] +// NVPTX-NEXT: call void @__gpu_exit() #[[ATTR7:[0-9]+]] +// NVPTX-NEXT: unreachable +// +// +// NVPTX-LABEL: define internal i32 @__gpu_num_blocks_x( +// NVPTX-SAME: ) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// +// NVPTX-LABEL: define internal i32 @__gpu_num_blocks_y( +// NVPTX-SAME: ) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// +// NVPTX-LABEL: define internal i32 @__gpu_num_blocks_z( +// NVPTX-SAME: ) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// +// NVPTX-LABEL: define internal i32 @__gpu_num_blocks( +// NVPTX-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[__DIM_ADDR:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4 +// NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4 +// NVPTX-NEXT: switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [ +// NVPTX-NEXT: i32 0, label %[[SW_BB:.*]] +// NVPTX-NEXT: i32 1, label %[[SW_BB1:.*]] +// NVPTX-NEXT: i32 2, label %[[SW_BB3:.*]] +// NVPTX-NEXT: ] +// NVPTX: [[SW_BB]]: +// NVPTX-NEXT: [[CALL:%.*]] = call i32 @__gpu_num_blocks_x() #[[ATTR6]] +// NVPTX-NEXT: store i32 [[CALL]], ptr [[RETVAL]], align 4 +// NVPTX-NEXT: br label %[[RETURN:.*]] +// NVPTX: [[SW_BB1]]: +// NVPTX-NEXT: [[CALL2:%.*]] = call i32 @__gpu_num_blocks_y() #[[ATTR6]] +// NVPTX-NEXT: store i32 [[CALL2]], ptr [[RETVAL]], align 4 +// NVPTX-NEXT: br label %[[RETURN]] +// NVPTX: [[SW_BB3]]: +// NVPTX-NEXT: [[CALL4:%.*]] = call i32 @__gpu_num_blocks_z() #[[ATTR6]] +// NVPTX-NEXT: store i32 [[CALL4]], ptr [[RETVAL]], align 4 +// NVPTX-NEXT: br label %[[RETURN]] +// NVPTX: [[SW_DEFAULT]]: +// NVPTX-NEXT: unreachable +// NVPTX: [[RETURN]]: +// NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4 +// NVPTX-NEXT: ret i32 [[TMP1]] +// +// +// NVPTX-LABEL: define internal i32 @__gpu_block_id_x( +// NVPTX-SAME: ) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// +// NVPTX-LABEL: define internal i32 @__gpu_block_id_y( +// NVPTX-SAME: ) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// +// NVPTX-LABEL: define internal i32 @__gpu_block_id_z( +// NVPTX-SAME: ) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// +// NVPTX-LABEL: define internal i32 @__gpu_block_id( +// NVPTX-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[__DIM_ADDR:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4 +// NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4 +// NVPTX-NEXT: switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [ +// NVPTX-NEXT: i32 0, label %[[SW_BB:.*]] +// NVPTX-NEXT: i32 1, label %[[SW_BB1:.*]] +// NVPTX-NEXT: i32 2, label %[[SW_BB3:.*]] +// NVPTX-NEXT: ] +// NVPTX: [[SW_BB]]: +// NVPTX-NEXT: [[CALL:%.*]] = call i32 @__gpu_block_id_x() #[[ATTR6]] +// NVPTX-NEXT: store i32 [[CALL]], ptr [[RETVAL]], align 4 +// NVPTX-NEXT: br label %[[RETURN:.*]] +// NVPTX: [[SW_BB1]]: +// NVPTX-NEXT: [[CALL2:%.*]] = call i32 @__gpu_block_id_y() #[[ATTR6]] +// NVPTX-NEXT: store i32 [[CALL2]], ptr [[RETVAL]], align 4 +// NVPTX-NEXT: br label %[[RETURN]] +// NVPTX: [[SW_BB3]]: +// NVPTX-NEXT: [[CALL4:%.*]] = call i32 @__gpu_block_id_z() #[[ATTR6]] +// NVPTX-NEXT: store i32 [[CALL4]], ptr [[RETVAL]], align 4 +// NVPTX-NEXT: br label %[[RETURN]] +// NVPTX: [[SW_DEFAULT]]: +// NVPTX-NEXT: unreachable +// NVPTX: [[RETURN]]: +// NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4 +// NVPTX-NEXT: ret i32 [[TMP1]] +// +// +// NVPTX-LABEL: define internal i32 @__gpu_num_threads_x( +// NVPTX-SAME: ) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// +// NVPTX-LABEL: define internal i32 @__gpu_num_threads_y( +// NVPTX-SAME: ) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// +// NVPTX-LABEL: define internal i32 @__gpu_num_threads_z( +// NVPTX-SAME: ) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.z() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// +// NVPTX-LABEL: define internal i32 @__gpu_num_threads( +// NVPTX-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[__DIM_ADDR:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4 +// NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4 +// NVPTX-NEXT: switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [ +// NVPTX-NEXT: i32 0, label %[[SW_BB:.*]] +// NVPTX-NEXT: i32 1, label %[[SW_BB1:.*]] +// NVPTX-NEXT: i32 2, label %[[SW_BB3:.*]] +// NVPTX-NEXT: ] +// NVPTX: [[SW_BB]]: +// NVPTX-NEXT: [[CALL:%.*]] = call i32 @__gpu_num_threads_x() #[[ATTR6]] +// NVPTX-NEXT: store i32 [[CALL]], ptr [[RETVAL]], align 4 +// NVPTX-NEXT: br label %[[RETURN:.*]] +// NVPTX: [[SW_BB1]]: +// NVPTX-NEXT: [[CALL2:%.*]] = call i32 @__gpu_num_threads_y() #[[ATTR6]] +// NVPTX-NEXT: store i32 [[CALL2]], ptr [[RETVAL]], align 4 +// NVPTX-NEXT: br label %[[RETURN]] +// NVPTX: [[SW_BB3]]: +// NVPTX-NEXT: [[CALL4:%.*]] = call i32 @__gpu_num_threads_z() #[[ATTR6]] +// NVPTX-NEXT: store i32 [[CALL4]], ptr [[RETVAL]], align 4 +// NVPTX-NEXT: br label %[[RETURN]] +// NVPTX: [[SW_DEFAULT]]: +// NVPTX-NEXT: unreachable +// NVPTX: [[RETURN]]: +// NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4 +// NVPTX-NEXT: ret i32 [[TMP1]] +// +// +// NVPTX-LABEL: define internal i32 @__gpu_thread_id_x( +// NVPTX-SAME: ) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// +// NVPTX-LABEL: define internal i32 @__gpu_thread_id_y( +// NVPTX-SAME: ) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.y() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// +// NVPTX-LABEL: define internal i32 @__gpu_thread_id_z( +// NVPTX-SAME: ) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.z() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// // NVPTX-LABEL: define internal i32 @__gpu_thread_id( // NVPTX-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] { // NVPTX-NEXT: [[ENTRY:.*:]] @@ -152,18 +742,173 @@ __gpu_kernel void foo() { // NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4 // NVPTX-NEXT: ret i32 [[TMP1]] // - __gpu_thread_id_y(); - __gpu_thread_id_z(); - __gpu_thread_id(0); - __gpu_num_lanes(); - __gpu_lane_id(); - __gpu_lane_mask(); - __gpu_read_first_lane_u32(-1, -1); - __gpu_ballot(-1, 1); - __gpu_sync_threads(); - __gpu_sync_lane(-1); - __gpu_shuffle_idx_u32(-1, -1, -1, 0); - __gpu_first_lane_id(-1); - __gpu_is_first_in_lane(-1); - __gpu_exit(); -} +// +// NVPTX-LABEL: define internal i32 @__gpu_num_lanes( +// NVPTX-SAME: ) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// +// NVPTX-LABEL: define internal i32 @__gpu_lane_id( +// NVPTX-SAME: ) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.laneid() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// +// NVPTX-LABEL: define internal i64 @__gpu_lane_mask( +// NVPTX-SAME: ) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.activemask() +// NVPTX-NEXT: [[CONV:%.*]] = zext i32 [[TMP0]] to i64 +// NVPTX-NEXT: ret i64 [[CONV]] +// +// +// NVPTX-LABEL: define internal i32 @__gpu_read_first_lane_u32( +// NVPTX-SAME: i64 noundef [[__LANE_MASK:%.*]], i32 noundef [[__X:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8 +// NVPTX-NEXT: [[__X_ADDR:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[__MASK:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[__ID:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[__X]], ptr [[__X_ADDR]], align 4 +// NVPTX-NEXT: [[TMP0:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8 +// NVPTX-NEXT: [[CONV:%.*]] = trunc i64 [[TMP0]] to i32 +// NVPTX-NEXT: store i32 [[CONV]], ptr [[__MASK]], align 4 +// NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[__MASK]], align 4 +// NVPTX-NEXT: [[TMP2:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP1]], i1 true) +// NVPTX-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 1 +// NVPTX-NEXT: [[ISZERO:%.*]] = icmp eq i32 [[TMP1]], 0 +// NVPTX-NEXT: [[FFS:%.*]] = select i1 [[ISZERO]], i32 0, i32 [[TMP3]] +// NVPTX-NEXT: [[SUB:%.*]] = sub nsw i32 [[FFS]], 1 +// NVPTX-NEXT: store i32 [[SUB]], ptr [[__ID]], align 4 +// NVPTX-NEXT: [[TMP4:%.*]] = load i32, ptr [[__MASK]], align 4 +// NVPTX-NEXT: [[TMP5:%.*]] = load i32, ptr [[__X_ADDR]], align 4 +// NVPTX-NEXT: [[TMP6:%.*]] = load i32, ptr [[__ID]], align 4 +// NVPTX-NEXT: [[CALL:%.*]] = call i32 @__gpu_num_lanes() #[[ATTR6]] +// NVPTX-NEXT: [[SUB1:%.*]] = sub i32 [[CALL]], 1 +// NVPTX-NEXT: [[TMP7:%.*]] = call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 [[TMP4]], i32 [[TMP5]], i32 [[TMP6]], i32 [[SUB1]]) +// NVPTX-NEXT: ret i32 [[TMP7]] +// +// +// NVPTX-LABEL: define internal i64 @__gpu_ballot( +// NVPTX-SAME: i64 noundef [[__LANE_MASK:%.*]], i1 noundef zeroext [[__X:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8 +// NVPTX-NEXT: [[__X_ADDR:%.*]] = alloca i8, align 1 +// NVPTX-NEXT: [[__MASK:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8 +// NVPTX-NEXT: [[STOREDV:%.*]] = zext i1 [[__X]] to i8 +// NVPTX-NEXT: store i8 [[STOREDV]], ptr [[__X_ADDR]], align 1 +// NVPTX-NEXT: [[TMP0:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8 +// NVPTX-NEXT: [[CONV:%.*]] = trunc i64 [[TMP0]] to i32 +// NVPTX-NEXT: store i32 [[CONV]], ptr [[__MASK]], align 4 +// NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[__MASK]], align 4 +// NVPTX-NEXT: [[TMP2:%.*]] = load i8, ptr [[__X_ADDR]], align 1 +// NVPTX-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP2]] to i1 +// NVPTX-NEXT: [[TMP3:%.*]] = call i32 @llvm.nvvm.vote.ballot.sync(i32 [[TMP1]], i1 [[LOADEDV]]) +// NVPTX-NEXT: [[CONV1:%.*]] = zext i32 [[TMP3]] to i64 +// NVPTX-NEXT: ret i64 [[CONV1]] +// +// +// NVPTX-LABEL: define internal void @__gpu_sync_threads( +// NVPTX-SAME: ) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: call void @llvm.nvvm.barrier0() +// NVPTX-NEXT: ret void +// +// +// NVPTX-LABEL: define internal void @__gpu_sync_lane( +// NVPTX-SAME: i64 noundef [[__LANE_MASK:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8 +// NVPTX-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8 +// NVPTX-NEXT: [[TMP0:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8 +// NVPTX-NEXT: [[CONV:%.*]] = trunc i64 [[TMP0]] to i32 +// NVPTX-NEXT: call void @llvm.nvvm.bar.warp.sync(i32 [[CONV]]) +// NVPTX-NEXT: ret void +// +// +// NVPTX-LABEL: define internal i32 @__gpu_shuffle_idx_u32( +// NVPTX-SAME: i64 noundef [[__LANE_MASK:%.*]], i32 noundef [[__IDX:%.*]], i32 noundef [[__X:%.*]], i32 noundef [[__WIDTH:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8 +// NVPTX-NEXT: [[__IDX_ADDR:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[__X_ADDR:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[__WIDTH_ADDR:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[__MASK:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[__BITMASK:%.*]] = alloca i8, align 1 +// NVPTX-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[__IDX]], ptr [[__IDX_ADDR]], align 4 +// NVPTX-NEXT: store i32 [[__X]], ptr [[__X_ADDR]], align 4 +// NVPTX-NEXT: store i32 [[__WIDTH]], ptr [[__WIDTH_ADDR]], align 4 +// NVPTX-NEXT: [[TMP0:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8 +// NVPTX-NEXT: [[CONV:%.*]] = trunc i64 [[TMP0]] to i32 +// NVPTX-NEXT: store i32 [[CONV]], ptr [[__MASK]], align 4 +// NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[__IDX_ADDR]], align 4 +// NVPTX-NEXT: [[SH_PROM:%.*]] = zext i32 [[TMP1]] to i64 +// NVPTX-NEXT: [[SHL:%.*]] = shl i64 1, [[SH_PROM]] +// NVPTX-NEXT: [[TMP2:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8 +// NVPTX-NEXT: [[AND:%.*]] = and i64 [[SHL]], [[TMP2]] +// NVPTX-NEXT: [[TOBOOL:%.*]] = icmp ne i64 [[AND]], 0 +// NVPTX-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// NVPTX-NEXT: store i8 [[STOREDV]], ptr [[__BITMASK]], align 1 +// NVPTX-NEXT: [[TMP3:%.*]] = load i8, ptr [[__BITMASK]], align 1 +// NVPTX-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 +// NVPTX-NEXT: [[CONV1:%.*]] = zext i1 [[LOADEDV]] to i32 +// NVPTX-NEXT: [[SUB:%.*]] = sub nsw i32 0, [[CONV1]] +// NVPTX-NEXT: [[TMP4:%.*]] = load i32, ptr [[__MASK]], align 4 +// NVPTX-NEXT: [[TMP5:%.*]] = load i32, ptr [[__X_ADDR]], align 4 +// NVPTX-NEXT: [[TMP6:%.*]] = load i32, ptr [[__IDX_ADDR]], align 4 +// NVPTX-NEXT: [[CALL:%.*]] = call i32 @__gpu_num_lanes() #[[ATTR6]] +// NVPTX-NEXT: [[TMP7:%.*]] = load i32, ptr [[__WIDTH_ADDR]], align 4 +// NVPTX-NEXT: [[SUB2:%.*]] = sub i32 [[CALL]], [[TMP7]] +// NVPTX-NEXT: [[SHL3:%.*]] = shl i32 [[SUB2]], 8 +// NVPTX-NEXT: [[OR:%.*]] = or i32 [[SHL3]], 31 +// NVPTX-NEXT: [[TMP8:%.*]] = call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 [[TMP4]], i32 [[TMP5]], i32 [[TMP6]], i32 [[OR]]) +// NVPTX-NEXT: [[AND4:%.*]] = and i32 [[SUB]], [[TMP8]] +// NVPTX-NEXT: ret i32 [[AND4]] +// +// +// NVPTX-LABEL: define internal i64 @__gpu_first_lane_id( +// NVPTX-SAME: i64 noundef [[__LANE_MASK:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8 +// NVPTX-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8 +// NVPTX-NEXT: [[TMP0:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8 +// NVPTX-NEXT: [[TMP1:%.*]] = call i64 @llvm.cttz.i64(i64 [[TMP0]], i1 true) +// NVPTX-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 1 +// NVPTX-NEXT: [[ISZERO:%.*]] = icmp eq i64 [[TMP0]], 0 +// NVPTX-NEXT: [[FFS:%.*]] = select i1 [[ISZERO]], i64 0, i64 [[TMP2]] +// NVPTX-NEXT: [[CAST:%.*]] = trunc i64 [[FFS]] to i32 +// NVPTX-NEXT: [[SUB:%.*]] = sub nsw i32 [[CAST]], 1 +// NVPTX-NEXT: [[CONV:%.*]] = sext i32 [[SUB]] to i64 +// NVPTX-NEXT: ret i64 [[CONV]] +// +// +// NVPTX-LABEL: define internal zeroext i1 @__gpu_is_first_in_lane( +// NVPTX-SAME: i64 noundef [[__LANE_MASK:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8 +// NVPTX-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8 +// NVPTX-NEXT: [[CALL:%.*]] = call i32 @__gpu_lane_id() #[[ATTR6]] +// NVPTX-NEXT: [[CONV:%.*]] = zext i32 [[CALL]] to i64 +// NVPTX-NEXT: [[TMP0:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8 +// NVPTX-NEXT: [[CALL1:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef [[TMP0]]) #[[ATTR6]] +// NVPTX-NEXT: [[CMP:%.*]] = icmp eq i64 [[CONV]], [[CALL1]] +// NVPTX-NEXT: ret i1 [[CMP]] +// +// +// NVPTX-LABEL: define internal void @__gpu_exit( +// NVPTX-SAME: ) #[[ATTR1:[0-9]+]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: call void @llvm.nvvm.exit() +// NVPTX-NEXT: ret void +// +//. +// AMDGPU: [[RNG3]] = !{i32 1, i32 0} +// AMDGPU: [[META4]] = !{} +// AMDGPU: [[RNG5]] = !{i16 1, i16 1025} +//. _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits