Author: Joseph Huber
Date: 2026-01-09T08:58:17-06:00
New Revision: 5c4324326d770bab1628225ebb1a04698a27b59b

URL: 
https://github.com/llvm/llvm-project/commit/5c4324326d770bab1628225ebb1a04698a27b59b
DIFF: 
https://github.com/llvm/llvm-project/commit/5c4324326d770bab1628225ebb1a04698a27b59b.diff

LOG: [SPIR-V] Initial support for SPIR-V in `gpuintrin.h` (#174910)

Summary:
https://github.com/llvm/llvm-project/pull/174862 and
https://github.com/llvm/llvm-project/pull/174655 provided the intrinsics
required to get the fundamental operations working on SPIR-V. This patch
sets up the basic support on top of them.

This should be the first step towards allowing SPIR-V to build things
like the LLVM libc and the OpenMP Device Runtime Library. The
implementations here are intentionally inefficient, e.g. not using the
dedicated SPIR-V opcode for read-firstlane; they are just a starting
point so that testing can begin.

Would appreciate someone more familiar with the backend double-checking
these.
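
As a rough illustration (not part of this patch), portable code written
against <gpuintrin.h> should now also compile for SPIR-V. The kernel below is
a hypothetical example, and the clang invocation is an assumption that may
need different flags:

  // example.c - hypothetical user code, uses only the portable __gpu_* API.
  #include <gpuintrin.h>

  // __gpu_kernel marks this function as a device kernel entry point.
  __gpu_kernel void square(int *in, int *out) {
    // Globally unique thread id from the workgroup id and local thread id.
    uint32_t id =
        __gpu_num_threads_x() * __gpu_block_id_x() + __gpu_thread_id_x();
    out[id] = in[id] * in[id];
  }

  // Assumed invocation (exact flags may differ):
  //   clang --target=spirv64 -emit-llvm -c example.c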

Added: 
    clang/lib/Headers/spirvintrin.h

Modified: 
    clang/lib/Headers/CMakeLists.txt
    clang/lib/Headers/gpuintrin.h
    clang/test/Headers/gpuintrin.c
    clang/test/Headers/gpuintrin_lang.c

Removed: 
    


################################################################################
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 1b96ac417bf70..c92b370b88d2d 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -297,6 +297,7 @@ set(gpu_files
   gpuintrin.h
   nvptxintrin.h
   amdgpuintrin.h
+  spirvintrin.h
   )
 
 set(windows_only_files

diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index cea19f0f74464..f3cf2d0776c0c 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -60,6 +60,8 @@ _Pragma("omp end declare target");
 #include <nvptxintrin.h>
 #elif defined(__AMDGPU__)
 #include <amdgpuintrin.h>
+#elif defined(__SPIRV__)
+#include <spirvintrin.h>
 #elif !defined(_OPENMP)
 #error "This header is only meant to be used on GPU architectures."
 #endif

diff --git a/clang/lib/Headers/spirvintrin.h b/clang/lib/Headers/spirvintrin.h
new file mode 100644
index 0000000000000..2a10a47adedde
--- /dev/null
+++ b/clang/lib/Headers/spirvintrin.h
@@ -0,0 +1,194 @@
+//===-- spirvintrin.h - SPIR-V intrinsic functions ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __SPIRVINTRIN_H
+#define __SPIRVINTRIN_H
+
+#ifndef __SPIRV__
+#error "This file is intended for SPIR-V targets or offloading to SPIR-V"
+#endif
+
+#ifndef __GPUINTRIN_H
+#error "Never use <spirvintrin.h> directly; include <gpuintrin.h> instead"
+#endif
+
+_Pragma("omp begin declare target device_type(nohost)");
+_Pragma("omp begin declare variant match(device = {arch(spirv64)})");
+
+// Type aliases to the address spaces used by the SPIR-V backend.
+#define __gpu_private __attribute__((address_space(0)))
+#define __gpu_constant __attribute__((address_space(2)))
+#define __gpu_local __attribute__((address_space(3)))
+#define __gpu_global __attribute__((address_space(1)))
+#define __gpu_generic __attribute__((address_space(4)))
+
+// Attribute to declare a function as a kernel.
+#define __gpu_kernel __attribute__((device_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
+  return __builtin_spirv_num_workgroups(0);
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_y(void) {
+  return __builtin_spirv_num_workgroups(1);
+}
+
+// Returns the number of workgroups in the 'z' dimension of the grid.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_z(void) {
+  return __builtin_spirv_num_workgroups(2);
+}
+
+// Returns the 'x' dimension of the current workgroup's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_x(void) {
+  return __builtin_spirv_workgroup_id(0);
+}
+
+// Returns the 'y' dimension of the current workgroup's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_y(void) {
+  return __builtin_spirv_workgroup_id(1);
+}
+
+// Returns the 'z' dimension of the current workgroup's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_z(void) {
+  return __builtin_spirv_workgroup_id(2);
+}
+
+// Returns the number of workitems in the 'x' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_x(void) {
+  return __builtin_spirv_workgroup_size(0);
+}
+
+// Returns the number of workitems in the 'y' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_y(void) {
+  return __builtin_spirv_workgroup_size(1);
+}
+
+// Returns the number of workitems in the 'z' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_z(void) {
+  return __builtin_spirv_workgroup_size(2);
+}
+
+// Returns the 'x' dimension id of the workitem in the current workgroup.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_x(void) {
+  return __builtin_spirv_local_invocation_id(0);
+}
+
+// Returns the 'y' dimension id of the workitem in the current workgroup.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_y(void) {
+  return __builtin_spirv_local_invocation_id(1);
+}
+
+// Returns the 'z' dimension id of the workitem in the current workgroup.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_z(void) {
+  return __builtin_spirv_local_invocation_id(2);
+}
+
+// Returns the size of a wavefront, either 32 or 64 depending on hardware
+// and compilation options.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_lanes(void) {
+  return __builtin_spirv_subgroup_size();
+}
+
+// Returns the id of the thread inside of a wavefront executing together.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_id(void) {
+  return __builtin_spirv_subgroup_local_invocation_id();
+}
+
+// Returns the bit-mask of active threads in the current wavefront. This
+// implementation is incorrect if the target uses more than 64 lanes.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) {
+  uint32_t [[clang::ext_vector_type(4)]] __mask =
+      __builtin_spirv_subgroup_ballot(1);
+  return __builtin_bit_cast(uint64_t,
+                            __builtin_shufflevector(__mask, __mask, 0, 1));
+}
+
+// Copies the value from the first active thread in the wavefront to the rest.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
+  return __builtin_spirv_subgroup_shuffle(__x,
+                                          __builtin_ctzg(__gpu_lane_mask()));
+}
+
+// Returns a bitmask of threads in the current lane for which \p x is true. This
+// implementation is incorrect if the target uses more than 64 lanes.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
+                                                          bool __x) {
+  // The lane_mask & gives the nvptx semantics when lane_mask is a subset of
+  // the active threads.
+  uint32_t [[clang::ext_vector_type(4)]] __mask =
+      __builtin_spirv_subgroup_ballot(__x);
+  return __lane_mask & __builtin_bit_cast(uint64_t, __builtin_shufflevector(
+                                                        __mask, __mask, 0, 1));
+}
+
+// Waits for all the threads in the block to converge and issues a fence.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_threads(void) {
+  __builtin_spirv_group_barrier();
+}
+
+// Wait for all threads in the wavefront to converge; this is a no-op on SPIR-V.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
+}
+
+// Shuffles the lanes inside the wavefront according to the given index.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
+                      uint32_t __width) {
+  uint32_t __lane = __idx + (__gpu_lane_id() & ~(__width - 1));
+  return __builtin_spirv_subgroup_shuffle(__x, __lane);
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
+  return __gpu_match_any_u32_impl(__lane_mask, __x);
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
+  return __gpu_match_any_u64_impl(__lane_mask, __x);
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
+  return __gpu_match_all_u32_impl(__lane_mask, __x);
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
+  return __gpu_match_all_u64_impl(__lane_mask, __x);
+}
+
+// SPIR-V does not expose this, always return false.
+_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
+  return 0;
+}
+
+// SPIR-V does not expose this, always return false.
+_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_private(void *ptr) {
+  return 0;
+}
+
+// SPIR-V only supports 'OpTerminateInvocation' in fragment shaders.
+_DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) {
+  __builtin_trap();
+}
+
+// This is a no-op as SPIR-V does not support it.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_thread_suspend(void) {}
+
+_Pragma("omp end declare variant");
+_Pragma("omp end declare target");
+
+#endif // __SPIRVINTRIN_H

diff --git a/clang/test/Headers/gpuintrin.c b/clang/test/Headers/gpuintrin.c
index d9813f28f0655..c8fe721c8c37c 100644
--- a/clang/test/Headers/gpuintrin.c
+++ b/clang/test/Headers/gpuintrin.c
@@ -8,6 +8,10 @@
 // RUN:   -target-feature +ptx62 \
 // RUN:   -triple nvptx64-nvidia-cuda -emit-llvm %s -o - \
 // RUN: | FileCheck %s --check-prefix=NVPTX
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include  \
+// RUN:   -internal-isystem %S/../../lib/Headers/ \
+// RUN:   -triple spirv64-- -emit-llvm %s -o - \
+// RUN: | FileCheck %s --check-prefix=SPIRV
 
 #include <gpuintrin.h>
 
@@ -985,8 +989,436 @@ __gpu_kernel void foo() {
 // NVPTX-NEXT:    call void @llvm.nvvm.exit()
 // NVPTX-NEXT:    ret void
 //
-//.
-// AMDGPU: [[RNG3]] = !{i32 1, i32 0}
-// AMDGPU: [[META4]] = !{}
-// AMDGPU: [[RNG5]] = !{i16 1, i16 1025}
-//.
+//
+// SPIRV-LABEL: define protected spir_kernel void @foo(
+// SPIRV-SAME: ) #[[ATTR0:[0-9]+]] {
+// SPIRV-NEXT:  [[ENTRY:.*:]]
+// SPIRV-NEXT:    [[CALL:%.*]] = call spir_func i32 @__gpu_num_blocks_x()
+// SPIRV-NEXT:    [[CALL1:%.*]] = call spir_func i32 @__gpu_num_blocks_y()
+// SPIRV-NEXT:    [[CALL2:%.*]] = call spir_func i32 @__gpu_num_blocks_z()
+// SPIRV-NEXT:    [[CALL3:%.*]] = call spir_func i32 @__gpu_num_blocks(i32 
noundef 0)
+// SPIRV-NEXT:    [[CALL4:%.*]] = call spir_func i32 @__gpu_block_id_x()
+// SPIRV-NEXT:    [[CALL5:%.*]] = call spir_func i32 @__gpu_block_id_y()
+// SPIRV-NEXT:    [[CALL6:%.*]] = call spir_func i32 @__gpu_block_id_z()
+// SPIRV-NEXT:    [[CALL7:%.*]] = call spir_func i32 @__gpu_block_id(i32 
noundef 0)
+// SPIRV-NEXT:    [[CALL8:%.*]] = call spir_func i32 @__gpu_num_threads_x()
+// SPIRV-NEXT:    [[CALL9:%.*]] = call spir_func i32 @__gpu_num_threads_y()
+// SPIRV-NEXT:    [[CALL10:%.*]] = call spir_func i32 @__gpu_num_threads_z()
+// SPIRV-NEXT:    [[CALL11:%.*]] = call spir_func i32 @__gpu_num_threads(i32 
noundef 0)
+// SPIRV-NEXT:    [[CALL12:%.*]] = call spir_func i32 @__gpu_thread_id_x()
+// SPIRV-NEXT:    [[CALL13:%.*]] = call spir_func i32 @__gpu_thread_id_y()
+// SPIRV-NEXT:    [[CALL14:%.*]] = call spir_func i32 @__gpu_thread_id_z()
+// SPIRV-NEXT:    [[CALL15:%.*]] = call spir_func i32 @__gpu_thread_id(i32 
noundef 0)
+// SPIRV-NEXT:    [[CALL16:%.*]] = call spir_func i32 @__gpu_num_lanes()
+// SPIRV-NEXT:    [[CALL17:%.*]] = call spir_func i32 @__gpu_lane_id()
+// SPIRV-NEXT:    [[CALL18:%.*]] = call spir_func i64 @__gpu_lane_mask()
+// SPIRV-NEXT:    [[CALL19:%.*]] = call spir_func i32 
@__gpu_read_first_lane_u32(i64 noundef -1, i32 noundef -1)
+// SPIRV-NEXT:    [[CALL20:%.*]] = call spir_func i64 
@__gpu_read_first_lane_u64(i64 noundef -1, i64 noundef -1)
+// SPIRV-NEXT:    [[CALL21:%.*]] = call spir_func i64 @__gpu_ballot(i64 
noundef -1, i1 noundef zeroext true)
+// SPIRV-NEXT:    call spir_func void @__gpu_sync_threads()
+// SPIRV-NEXT:    call spir_func void @__gpu_sync_lane(i64 noundef -1)
+// SPIRV-NEXT:    [[CALL22:%.*]] = call spir_func i32 
@__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 
noundef 0)
+// SPIRV-NEXT:    [[CALL23:%.*]] = call spir_func i64 @__gpu_first_lane_id(i64 
noundef -1)
+// SPIRV-NEXT:    [[CALL24:%.*]] = call spir_func zeroext i1 
@__gpu_is_first_in_lane(i64 noundef -1)
+// SPIRV-NEXT:    call spir_func void @__gpu_exit() #[[ATTR7:[0-9]+]]
+// SPIRV-NEXT:    unreachable
+//
+//
+// SPIRV-LABEL: define internal spir_func i32 @__gpu_num_blocks_x(
+// SPIRV-SAME: ) #[[ATTR0]] {
+// SPIRV-NEXT:  [[ENTRY:.*:]]
+// SPIRV-NEXT:    [[SPV_NUM_WORKGROUPS:%.*]] = call i64 
@llvm.spv.num.workgroups.i64(i32 0)
+// SPIRV-NEXT:    [[CONV:%.*]] = trunc i64 [[SPV_NUM_WORKGROUPS]] to i32
+// SPIRV-NEXT:    ret i32 [[CONV]]
+//
+//
+// SPIRV-LABEL: define internal spir_func i32 @__gpu_num_blocks_y(
+// SPIRV-SAME: ) #[[ATTR0]] {
+// SPIRV-NEXT:  [[ENTRY:.*:]]
+// SPIRV-NEXT:    [[SPV_NUM_WORKGROUPS:%.*]] = call i64 
@llvm.spv.num.workgroups.i64(i32 1)
+// SPIRV-NEXT:    [[CONV:%.*]] = trunc i64 [[SPV_NUM_WORKGROUPS]] to i32
+// SPIRV-NEXT:    ret i32 [[CONV]]
+//
+//
+// SPIRV-LABEL: define internal spir_func i32 @__gpu_num_blocks_z(
+// SPIRV-SAME: ) #[[ATTR0]] {
+// SPIRV-NEXT:  [[ENTRY:.*:]]
+// SPIRV-NEXT:    [[SPV_NUM_WORKGROUPS:%.*]] = call i64 
@llvm.spv.num.workgroups.i64(i32 2)
+// SPIRV-NEXT:    [[CONV:%.*]] = trunc i64 [[SPV_NUM_WORKGROUPS]] to i32
+// SPIRV-NEXT:    ret i32 [[CONV]]
+//
+//
+// SPIRV-LABEL: define internal spir_func i32 @__gpu_num_blocks(
+// SPIRV-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT:  [[ENTRY:.*:]]
+// SPIRV-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
+// SPIRV-NEXT:    [[__DIM_ADDR:%.*]] = alloca i32, align 4
+// SPIRV-NEXT:    store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4
+// SPIRV-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4
+// SPIRV-NEXT:    switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
+// SPIRV-NEXT:      i32 0, label %[[SW_BB:.*]]
+// SPIRV-NEXT:      i32 1, label %[[SW_BB1:.*]]
+// SPIRV-NEXT:      i32 2, label %[[SW_BB3:.*]]
+// SPIRV-NEXT:    ]
+// SPIRV:       [[SW_BB]]:
+// SPIRV-NEXT:    [[CALL:%.*]] = call spir_func i32 @__gpu_num_blocks_x()
+// SPIRV-NEXT:    store i32 [[CALL]], ptr [[RETVAL]], align 4
+// SPIRV-NEXT:    br label %[[RETURN:.*]]
+// SPIRV:       [[SW_BB1]]:
+// SPIRV-NEXT:    [[CALL2:%.*]] = call spir_func i32 @__gpu_num_blocks_y()
+// SPIRV-NEXT:    store i32 [[CALL2]], ptr [[RETVAL]], align 4
+// SPIRV-NEXT:    br label %[[RETURN]]
+// SPIRV:       [[SW_BB3]]:
+// SPIRV-NEXT:    [[CALL4:%.*]] = call spir_func i32 @__gpu_num_blocks_z()
+// SPIRV-NEXT:    store i32 [[CALL4]], ptr [[RETVAL]], align 4
+// SPIRV-NEXT:    br label %[[RETURN]]
+// SPIRV:       [[SW_DEFAULT]]:
+// SPIRV-NEXT:    store i32 1, ptr [[RETVAL]], align 4
+// SPIRV-NEXT:    br label %[[RETURN]]
+// SPIRV:       [[RETURN]]:
+// SPIRV-NEXT:    [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4
+// SPIRV-NEXT:    ret i32 [[TMP1]]
+//
+//
+// SPIRV-LABEL: define internal spir_func i32 @__gpu_block_id_x(
+// SPIRV-SAME: ) #[[ATTR0]] {
+// SPIRV-NEXT:  [[ENTRY:.*:]]
+// SPIRV-NEXT:    [[SPV_GROUP_ID:%.*]] = call i64 @llvm.spv.group.id.i64(i32 0)
+// SPIRV-NEXT:    [[CONV:%.*]] = trunc i64 [[SPV_GROUP_ID]] to i32
+// SPIRV-NEXT:    ret i32 [[CONV]]
+//
+//
+// SPIRV-LABEL: define internal spir_func i32 @__gpu_block_id_y(
+// SPIRV-SAME: ) #[[ATTR0]] {
+// SPIRV-NEXT:  [[ENTRY:.*:]]
+// SPIRV-NEXT:    [[SPV_GROUP_ID:%.*]] = call i64 @llvm.spv.group.id.i64(i32 1)
+// SPIRV-NEXT:    [[CONV:%.*]] = trunc i64 [[SPV_GROUP_ID]] to i32
+// SPIRV-NEXT:    ret i32 [[CONV]]
+//
+//
+// SPIRV-LABEL: define internal spir_func i32 @__gpu_block_id_z(
+// SPIRV-SAME: ) #[[ATTR0]] {
+// SPIRV-NEXT:  [[ENTRY:.*:]]
+// SPIRV-NEXT:    [[SPV_GROUP_ID:%.*]] = call i64 @llvm.spv.group.id.i64(i32 2)
+// SPIRV-NEXT:    [[CONV:%.*]] = trunc i64 [[SPV_GROUP_ID]] to i32
+// SPIRV-NEXT:    ret i32 [[CONV]]
+//
+//
+// SPIRV-LABEL: define internal spir_func i32 @__gpu_block_id(
+// SPIRV-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT:  [[ENTRY:.*:]]
+// SPIRV-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
+// SPIRV-NEXT:    [[__DIM_ADDR:%.*]] = alloca i32, align 4
+// SPIRV-NEXT:    store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4
+// SPIRV-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4
+// SPIRV-NEXT:    switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
+// SPIRV-NEXT:      i32 0, label %[[SW_BB:.*]]
+// SPIRV-NEXT:      i32 1, label %[[SW_BB1:.*]]
+// SPIRV-NEXT:      i32 2, label %[[SW_BB3:.*]]
+// SPIRV-NEXT:    ]
+// SPIRV:       [[SW_BB]]:
+// SPIRV-NEXT:    [[CALL:%.*]] = call spir_func i32 @__gpu_block_id_x()
+// SPIRV-NEXT:    store i32 [[CALL]], ptr [[RETVAL]], align 4
+// SPIRV-NEXT:    br label %[[RETURN:.*]]
+// SPIRV:       [[SW_BB1]]:
+// SPIRV-NEXT:    [[CALL2:%.*]] = call spir_func i32 @__gpu_block_id_y()
+// SPIRV-NEXT:    store i32 [[CALL2]], ptr [[RETVAL]], align 4
+// SPIRV-NEXT:    br label %[[RETURN]]
+// SPIRV:       [[SW_BB3]]:
+// SPIRV-NEXT:    [[CALL4:%.*]] = call spir_func i32 @__gpu_block_id_z()
+// SPIRV-NEXT:    store i32 [[CALL4]], ptr [[RETVAL]], align 4
+// SPIRV-NEXT:    br label %[[RETURN]]
+// SPIRV:       [[SW_DEFAULT]]:
+// SPIRV-NEXT:    store i32 0, ptr [[RETVAL]], align 4
+// SPIRV-NEXT:    br label %[[RETURN]]
+// SPIRV:       [[RETURN]]:
+// SPIRV-NEXT:    [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4
+// SPIRV-NEXT:    ret i32 [[TMP1]]
+//
+//
+// SPIRV-LABEL: define internal spir_func i32 @__gpu_num_threads_x(
+// SPIRV-SAME: ) #[[ATTR0]] {
+// SPIRV-NEXT:  [[ENTRY:.*:]]
+// SPIRV-NEXT:    [[SPV_WORKGROUP_SIZE:%.*]] = call i64 
@llvm.spv.workgroup.size.i64(i32 0)
+// SPIRV-NEXT:    [[CONV:%.*]] = trunc i64 [[SPV_WORKGROUP_SIZE]] to i32
+// SPIRV-NEXT:    ret i32 [[CONV]]
+//
+//
+// SPIRV-LABEL: define internal spir_func i32 @__gpu_num_threads_y(
+// SPIRV-SAME: ) #[[ATTR0]] {
+// SPIRV-NEXT:  [[ENTRY:.*:]]
+// SPIRV-NEXT:    [[SPV_WORKGROUP_SIZE:%.*]] = call i64 
@llvm.spv.workgroup.size.i64(i32 1)
+// SPIRV-NEXT:    [[CONV:%.*]] = trunc i64 [[SPV_WORKGROUP_SIZE]] to i32
+// SPIRV-NEXT:    ret i32 [[CONV]]
+//
+//
+// SPIRV-LABEL: define internal spir_func i32 @__gpu_num_threads_z(
+// SPIRV-SAME: ) #[[ATTR0]] {
+// SPIRV-NEXT:  [[ENTRY:.*:]]
+// SPIRV-NEXT:    [[SPV_WORKGROUP_SIZE:%.*]] = call i64 
@llvm.spv.workgroup.size.i64(i32 2)
+// SPIRV-NEXT:    [[CONV:%.*]] = trunc i64 [[SPV_WORKGROUP_SIZE]] to i32
+// SPIRV-NEXT:    ret i32 [[CONV]]
+//
+//
+// SPIRV-LABEL: define internal spir_func i32 @__gpu_num_threads(
+// SPIRV-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT:  [[ENTRY:.*:]]
+// SPIRV-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
+// SPIRV-NEXT:    [[__DIM_ADDR:%.*]] = alloca i32, align 4
+// SPIRV-NEXT:    store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4
+// SPIRV-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4
+// SPIRV-NEXT:    switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
+// SPIRV-NEXT:      i32 0, label %[[SW_BB:.*]]
+// SPIRV-NEXT:      i32 1, label %[[SW_BB1:.*]]
+// SPIRV-NEXT:      i32 2, label %[[SW_BB3:.*]]
+// SPIRV-NEXT:    ]
+// SPIRV:       [[SW_BB]]:
+// SPIRV-NEXT:    [[CALL:%.*]] = call spir_func i32 @__gpu_num_threads_x()
+// SPIRV-NEXT:    store i32 [[CALL]], ptr [[RETVAL]], align 4
+// SPIRV-NEXT:    br label %[[RETURN:.*]]
+// SPIRV:       [[SW_BB1]]:
+// SPIRV-NEXT:    [[CALL2:%.*]] = call spir_func i32 @__gpu_num_threads_y()
+// SPIRV-NEXT:    store i32 [[CALL2]], ptr [[RETVAL]], align 4
+// SPIRV-NEXT:    br label %[[RETURN]]
+// SPIRV:       [[SW_BB3]]:
+// SPIRV-NEXT:    [[CALL4:%.*]] = call spir_func i32 @__gpu_num_threads_z()
+// SPIRV-NEXT:    store i32 [[CALL4]], ptr [[RETVAL]], align 4
+// SPIRV-NEXT:    br label %[[RETURN]]
+// SPIRV:       [[SW_DEFAULT]]:
+// SPIRV-NEXT:    store i32 1, ptr [[RETVAL]], align 4
+// SPIRV-NEXT:    br label %[[RETURN]]
+// SPIRV:       [[RETURN]]:
+// SPIRV-NEXT:    [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4
+// SPIRV-NEXT:    ret i32 [[TMP1]]
+//
+//
+// SPIRV-LABEL: define internal spir_func i32 @__gpu_thread_id_x(
+// SPIRV-SAME: ) #[[ATTR0]] {
+// SPIRV-NEXT:  [[ENTRY:.*:]]
+// SPIRV-NEXT:    [[SPV_THREAD_ID_IN_GROUP:%.*]] = call i64 
@llvm.spv.thread.id.in.group.i64(i32 0)
+// SPIRV-NEXT:    [[CONV:%.*]] = trunc i64 [[SPV_THREAD_ID_IN_GROUP]] to i32
+// SPIRV-NEXT:    ret i32 [[CONV]]
+//
+//
+// SPIRV-LABEL: define internal spir_func i32 @__gpu_thread_id_y(
+// SPIRV-SAME: ) #[[ATTR0]] {
+// SPIRV-NEXT:  [[ENTRY:.*:]]
+// SPIRV-NEXT:    [[SPV_THREAD_ID_IN_GROUP:%.*]] = call i64 
@llvm.spv.thread.id.in.group.i64(i32 1)
+// SPIRV-NEXT:    [[CONV:%.*]] = trunc i64 [[SPV_THREAD_ID_IN_GROUP]] to i32
+// SPIRV-NEXT:    ret i32 [[CONV]]
+//
+//
+// SPIRV-LABEL: define internal spir_func i32 @__gpu_thread_id_z(
+// SPIRV-SAME: ) #[[ATTR0]] {
+// SPIRV-NEXT:  [[ENTRY:.*:]]
+// SPIRV-NEXT:    [[SPV_THREAD_ID_IN_GROUP:%.*]] = call i64 
@llvm.spv.thread.id.in.group.i64(i32 2)
+// SPIRV-NEXT:    [[CONV:%.*]] = trunc i64 [[SPV_THREAD_ID_IN_GROUP]] to i32
+// SPIRV-NEXT:    ret i32 [[CONV]]
+//
+//
+// SPIRV-LABEL: define internal spir_func i32 @__gpu_thread_id(
+// SPIRV-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT:  [[ENTRY:.*:]]
+// SPIRV-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
+// SPIRV-NEXT:    [[__DIM_ADDR:%.*]] = alloca i32, align 4
+// SPIRV-NEXT:    store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4
+// SPIRV-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4
+// SPIRV-NEXT:    switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
+// SPIRV-NEXT:      i32 0, label %[[SW_BB:.*]]
+// SPIRV-NEXT:      i32 1, label %[[SW_BB1:.*]]
+// SPIRV-NEXT:      i32 2, label %[[SW_BB3:.*]]
+// SPIRV-NEXT:    ]
+// SPIRV:       [[SW_BB]]:
+// SPIRV-NEXT:    [[CALL:%.*]] = call spir_func i32 @__gpu_thread_id_x()
+// SPIRV-NEXT:    store i32 [[CALL]], ptr [[RETVAL]], align 4
+// SPIRV-NEXT:    br label %[[RETURN:.*]]
+// SPIRV:       [[SW_BB1]]:
+// SPIRV-NEXT:    [[CALL2:%.*]] = call spir_func i32 @__gpu_thread_id_y()
+// SPIRV-NEXT:    store i32 [[CALL2]], ptr [[RETVAL]], align 4
+// SPIRV-NEXT:    br label %[[RETURN]]
+// SPIRV:       [[SW_BB3]]:
+// SPIRV-NEXT:    [[CALL4:%.*]] = call spir_func i32 @__gpu_thread_id_z()
+// SPIRV-NEXT:    store i32 [[CALL4]], ptr [[RETVAL]], align 4
+// SPIRV-NEXT:    br label %[[RETURN]]
+// SPIRV:       [[SW_DEFAULT]]:
+// SPIRV-NEXT:    store i32 0, ptr [[RETVAL]], align 4
+// SPIRV-NEXT:    br label %[[RETURN]]
+// SPIRV:       [[RETURN]]:
+// SPIRV-NEXT:    [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4
+// SPIRV-NEXT:    ret i32 [[TMP1]]
+//
+//
+// SPIRV-LABEL: define internal spir_func i32 @__gpu_num_lanes(
+// SPIRV-SAME: ) #[[ATTR0]] {
+// SPIRV-NEXT:  [[ENTRY:.*:]]
+// SPIRV-NEXT:    [[TMP0:%.*]] = call i32 @llvm.spv.subgroup.size()
+// SPIRV-NEXT:    ret i32 [[TMP0]]
+//
+//
+// SPIRV-LABEL: define internal spir_func i32 @__gpu_lane_id(
+// SPIRV-SAME: ) #[[ATTR0]] {
+// SPIRV-NEXT:  [[ENTRY:.*:]]
+// SPIRV-NEXT:    [[TMP0:%.*]] = call i32 
@llvm.spv.subgroup.local.invocation.id()
+// SPIRV-NEXT:    ret i32 [[TMP0]]
+//
+//
+// SPIRV-LABEL: define internal spir_func i64 @__gpu_lane_mask(
+// SPIRV-SAME: ) #[[ATTR0]] {
+// SPIRV-NEXT:  [[ENTRY:.*:]]
+// SPIRV-NEXT:    [[__MASK:%.*]] = alloca <4 x i32>, align 16
+// SPIRV-NEXT:    [[REF_TMP:%.*]] = alloca <2 x i32>, align 8
+// SPIRV-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.spv.wave.ballot(i1 true)
+// SPIRV-NEXT:    store <4 x i32> [[TMP0]], ptr [[__MASK]], align 16
+// SPIRV-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[__MASK]], align 16
+// SPIRV-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[__MASK]], align 16
+// SPIRV-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x 
i32> [[TMP2]], <2 x i32> <i32 0, i32 1>
+// SPIRV-NEXT:    store <2 x i32> [[SHUFFLE]], ptr [[REF_TMP]], align 8
+// SPIRV-NEXT:    [[TMP3:%.*]] = load i64, ptr [[REF_TMP]], align 8
+// SPIRV-NEXT:    ret i64 [[TMP3]]
+//
+//
+// SPIRV-LABEL: define internal spir_func i32 @__gpu_read_first_lane_u32(
+// SPIRV-SAME: i64 noundef [[__LANE_MASK:%.*]], i32 noundef [[__X:%.*]]) 
#[[ATTR0]] {
+// SPIRV-NEXT:  [[ENTRY:.*:]]
+// SPIRV-NEXT:    [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8
+// SPIRV-NEXT:    [[__X_ADDR:%.*]] = alloca i32, align 4
+// SPIRV-NEXT:    store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRV-NEXT:    store i32 [[__X]], ptr [[__X_ADDR]], align 4
+// SPIRV-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__X_ADDR]], align 4
+// SPIRV-NEXT:    [[CALL:%.*]] = call spir_func i64 @__gpu_lane_mask()
+// SPIRV-NEXT:    [[TMP1:%.*]] = call i64 @llvm.cttz.i64(i64 [[CALL]], i1 true)
+// SPIRV-NEXT:    [[CAST:%.*]] = trunc i64 [[TMP1]] to i32
+// SPIRV-NEXT:    [[SPV_SHUFFLE:%.*]] = call i32 
@llvm.spv.wave.readlane.i32(i32 [[TMP0]], i32 [[CAST]])
+// SPIRV-NEXT:    ret i32 [[SPV_SHUFFLE]]
+//
+//
+// SPIRV-LABEL: define internal spir_func i64 @__gpu_read_first_lane_u64(
+// SPIRV-SAME: i64 noundef [[__LANE_MASK:%.*]], i64 noundef [[__X:%.*]]) 
#[[ATTR0]] {
+// SPIRV-NEXT:  [[ENTRY:.*:]]
+// SPIRV-NEXT:    [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8
+// SPIRV-NEXT:    [[__X_ADDR:%.*]] = alloca i64, align 8
+// SPIRV-NEXT:    [[__HI:%.*]] = alloca i32, align 4
+// SPIRV-NEXT:    [[__LO:%.*]] = alloca i32, align 4
+// SPIRV-NEXT:    store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRV-NEXT:    store i64 [[__X]], ptr [[__X_ADDR]], align 8
+// SPIRV-NEXT:    [[TMP0:%.*]] = load i64, ptr [[__X_ADDR]], align 8
+// SPIRV-NEXT:    [[SHR:%.*]] = lshr i64 [[TMP0]], 32
+// SPIRV-NEXT:    [[CONV:%.*]] = trunc i64 [[SHR]] to i32
+// SPIRV-NEXT:    store i32 [[CONV]], ptr [[__HI]], align 4
+// SPIRV-NEXT:    [[TMP1:%.*]] = load i64, ptr [[__X_ADDR]], align 8
+// SPIRV-NEXT:    [[AND:%.*]] = and i64 [[TMP1]], 4294967295
+// SPIRV-NEXT:    [[CONV1:%.*]] = trunc i64 [[AND]] to i32
+// SPIRV-NEXT:    store i32 [[CONV1]], ptr [[__LO]], align 4
+// SPIRV-NEXT:    [[TMP2:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRV-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__HI]], align 4
+// SPIRV-NEXT:    [[CALL:%.*]] = call spir_func i32 
@__gpu_read_first_lane_u32(i64 noundef [[TMP2]], i32 noundef [[TMP3]])
+// SPIRV-NEXT:    [[CONV2:%.*]] = zext i32 [[CALL]] to i64
+// SPIRV-NEXT:    [[SHL:%.*]] = shl i64 [[CONV2]], 32
+// SPIRV-NEXT:    [[TMP4:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRV-NEXT:    [[TMP5:%.*]] = load i32, ptr [[__LO]], align 4
+// SPIRV-NEXT:    [[CALL3:%.*]] = call spir_func i32 
@__gpu_read_first_lane_u32(i64 noundef [[TMP4]], i32 noundef [[TMP5]])
+// SPIRV-NEXT:    [[CONV4:%.*]] = zext i32 [[CALL3]] to i64
+// SPIRV-NEXT:    [[AND5:%.*]] = and i64 [[CONV4]], 4294967295
+// SPIRV-NEXT:    [[OR:%.*]] = or i64 [[SHL]], [[AND5]]
+// SPIRV-NEXT:    ret i64 [[OR]]
+//
+//
+// SPIRV-LABEL: define internal spir_func i64 @__gpu_ballot(
+// SPIRV-SAME: i64 noundef [[__LANE_MASK:%.*]], i1 noundef zeroext 
[[__X:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT:  [[ENTRY:.*:]]
+// SPIRV-NEXT:    [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8
+// SPIRV-NEXT:    [[__X_ADDR:%.*]] = alloca i8, align 1
+// SPIRV-NEXT:    [[__MASK:%.*]] = alloca <4 x i32>, align 16
+// SPIRV-NEXT:    [[REF_TMP:%.*]] = alloca <2 x i32>, align 8
+// SPIRV-NEXT:    store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRV-NEXT:    [[STOREDV:%.*]] = zext i1 [[__X]] to i8
+// SPIRV-NEXT:    store i8 [[STOREDV]], ptr [[__X_ADDR]], align 1
+// SPIRV-NEXT:    [[TMP0:%.*]] = load i8, ptr [[__X_ADDR]], align 1
+// SPIRV-NEXT:    [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1
+// SPIRV-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.spv.wave.ballot(i1 
[[LOADEDV]])
+// SPIRV-NEXT:    store <4 x i32> [[TMP1]], ptr [[__MASK]], align 16
+// SPIRV-NEXT:    [[TMP2:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRV-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[__MASK]], align 16
+// SPIRV-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[__MASK]], align 16
+// SPIRV-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x 
i32> [[TMP4]], <2 x i32> <i32 0, i32 1>
+// SPIRV-NEXT:    store <2 x i32> [[SHUFFLE]], ptr [[REF_TMP]], align 8
+// SPIRV-NEXT:    [[TMP5:%.*]] = load i64, ptr [[REF_TMP]], align 8
+// SPIRV-NEXT:    [[AND:%.*]] = and i64 [[TMP2]], [[TMP5]]
+// SPIRV-NEXT:    ret i64 [[AND]]
+//
+//
+// SPIRV-LABEL: define internal spir_func void @__gpu_sync_threads(
+// SPIRV-SAME: ) #[[ATTR0]] {
+// SPIRV-NEXT:  [[ENTRY:.*:]]
+// SPIRV-NEXT:    call void @llvm.spv.group.memory.barrier.with.group.sync()
+// SPIRV-NEXT:    ret void
+//
+//
+// SPIRV-LABEL: define internal spir_func void @__gpu_sync_lane(
+// SPIRV-SAME: i64 noundef [[__LANE_MASK:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT:  [[ENTRY:.*:]]
+// SPIRV-NEXT:    [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8
+// SPIRV-NEXT:    store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRV-NEXT:    ret void
+//
+//
+// SPIRV-LABEL: define internal spir_func i32 @__gpu_shuffle_idx_u32(
+// SPIRV-SAME: i64 noundef [[__LANE_MASK:%.*]], i32 noundef [[__IDX:%.*]], i32 
noundef [[__X:%.*]], i32 noundef [[__WIDTH:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT:  [[ENTRY:.*:]]
+// SPIRV-NEXT:    [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8
+// SPIRV-NEXT:    [[__IDX_ADDR:%.*]] = alloca i32, align 4
+// SPIRV-NEXT:    [[__X_ADDR:%.*]] = alloca i32, align 4
+// SPIRV-NEXT:    [[__WIDTH_ADDR:%.*]] = alloca i32, align 4
+// SPIRV-NEXT:    [[__LANE:%.*]] = alloca i32, align 4
+// SPIRV-NEXT:    store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRV-NEXT:    store i32 [[__IDX]], ptr [[__IDX_ADDR]], align 4
+// SPIRV-NEXT:    store i32 [[__X]], ptr [[__X_ADDR]], align 4
+// SPIRV-NEXT:    store i32 [[__WIDTH]], ptr [[__WIDTH_ADDR]], align 4
+// SPIRV-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__IDX_ADDR]], align 4
+// SPIRV-NEXT:    [[CALL:%.*]] = call spir_func i32 @__gpu_lane_id()
+// SPIRV-NEXT:    [[TMP1:%.*]] = load i32, ptr [[__WIDTH_ADDR]], align 4
+// SPIRV-NEXT:    [[SUB:%.*]] = sub i32 [[TMP1]], 1
+// SPIRV-NEXT:    [[NOT:%.*]] = xor i32 [[SUB]], -1
+// SPIRV-NEXT:    [[AND:%.*]] = and i32 [[CALL]], [[NOT]]
+// SPIRV-NEXT:    [[ADD:%.*]] = add i32 [[TMP0]], [[AND]]
+// SPIRV-NEXT:    store i32 [[ADD]], ptr [[__LANE]], align 4
+// SPIRV-NEXT:    [[TMP2:%.*]] = load i32, ptr [[__X_ADDR]], align 4
+// SPIRV-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__LANE]], align 4
+// SPIRV-NEXT:    [[SPV_SHUFFLE:%.*]] = call i32 
@llvm.spv.wave.readlane.i32(i32 [[TMP2]], i32 [[TMP3]])
+// SPIRV-NEXT:    ret i32 [[SPV_SHUFFLE]]
+//
+//
+// SPIRV-LABEL: define internal spir_func i64 @__gpu_first_lane_id(
+// SPIRV-SAME: i64 noundef [[__LANE_MASK:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT:  [[ENTRY:.*:]]
+// SPIRV-NEXT:    [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8
+// SPIRV-NEXT:    store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRV-NEXT:    [[TMP0:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRV-NEXT:    [[TMP1:%.*]] = call i64 @llvm.cttz.i64(i64 [[TMP0]], i1 true)
+// SPIRV-NEXT:    [[TMP2:%.*]] = add i64 [[TMP1]], 1
+// SPIRV-NEXT:    [[ISZERO:%.*]] = icmp eq i64 [[TMP0]], 0
+// SPIRV-NEXT:    [[FFS:%.*]] = select i1 [[ISZERO]], i64 0, i64 [[TMP2]]
+// SPIRV-NEXT:    [[CAST:%.*]] = trunc i64 [[FFS]] to i32
+// SPIRV-NEXT:    [[SUB:%.*]] = sub nsw i32 [[CAST]], 1
+// SPIRV-NEXT:    [[CONV:%.*]] = sext i32 [[SUB]] to i64
+// SPIRV-NEXT:    ret i64 [[CONV]]
+//
+//
+// SPIRV-LABEL: define internal spir_func zeroext i1 @__gpu_is_first_in_lane(
+// SPIRV-SAME: i64 noundef [[__LANE_MASK:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT:  [[ENTRY:.*:]]
+// SPIRV-NEXT:    [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8
+// SPIRV-NEXT:    store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRV-NEXT:    [[CALL:%.*]] = call spir_func i32 @__gpu_lane_id()
+// SPIRV-NEXT:    [[CONV:%.*]] = zext i32 [[CALL]] to i64
+// SPIRV-NEXT:    [[TMP0:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRV-NEXT:    [[CALL1:%.*]] = call spir_func i64 @__gpu_first_lane_id(i64 
noundef [[TMP0]])
+// SPIRV-NEXT:    [[CMP:%.*]] = icmp eq i64 [[CONV]], [[CALL1]]
+// SPIRV-NEXT:    ret i1 [[CMP]]
+//
+//
+// SPIRV-LABEL: define internal spir_func void @__gpu_exit(
+// SPIRV-SAME: ) #[[ATTR1:[0-9]+]] {
+// SPIRV-NEXT:  [[ENTRY:.*:]]
+// SPIRV-NEXT:    call void @llvm.trap()
+// SPIRV-NEXT:    ret void

diff --git a/clang/test/Headers/gpuintrin_lang.c b/clang/test/Headers/gpuintrin_lang.c
index 653f87aea2ce3..e3db72d5ff928 100644
--- a/clang/test/Headers/gpuintrin_lang.c
+++ b/clang/test/Headers/gpuintrin_lang.c
@@ -22,6 +22,11 @@
 // RUN:   -fopenmp-is-target-device -triple amdgcn -emit-llvm %s -o - \
 // RUN: | FileCheck %s --check-prefix=OPENMP
 //
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include -DSYCL \
+// RUN:   -internal-isystem %S/../../lib/Headers/ -fsycl-is-device \
+// RUN:   -x c++ -triple spirv64 -emit-llvm %s -o - \
+// RUN: | FileCheck %s --check-prefix=SYCL
+//
 // RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
 // RUN:   -std=c89 -internal-isystem %S/../../lib/Headers/ \
 // RUN:   -triple amdgcn-amd-amdhsa -emit-llvm %s -o - \
@@ -32,11 +37,13 @@
 
 #ifdef __device__
 __device__ int foo() { return __gpu_thread_id_x(); }
+#elif defined(SYCL)
+extern "C" [[clang::sycl_external]] int foo() { return __gpu_thread_id_x(); }
 #else
 // CUDA-LABEL: define dso_local i32 @foo(
 // CUDA-SAME: ) #[[ATTR0:[0-9]+]] {
 // CUDA-NEXT:  [[ENTRY:.*:]]
-// CUDA-NEXT:    [[TMP0:%.*]] = call {{.*}}i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CUDA-NEXT:    [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 // CUDA-NEXT:    ret i32 [[TMP0]]
 //
 // HIP-LABEL: define dso_local i32 @foo(
@@ -61,6 +68,17 @@ __device__ int foo() { return __gpu_thread_id_x(); }
 // OPENMP-NEXT:    [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
 // OPENMP-NEXT:    ret i32 [[TMP0]]
 //
+// SYCL-LABEL: define spir_func i32 @foo(
+// SYCL-SAME: ) #[[ATTR0:[0-9]+]] {
+// SYCL-NEXT:  [[ENTRY:.*:]]
+// SYCL-NEXT:    [[RETVAL_I:%.*]] = alloca i32, align 4
+// SYCL-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
+// SYCL-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr [[RETVAL]] to ptr 
addrspace(4)
+// SYCL-NEXT:    [[RETVAL_ASCAST_I:%.*]] = addrspacecast ptr [[RETVAL_I]] to 
ptr addrspace(4)
+// SYCL-NEXT:    [[SPV_THREAD_ID_IN_GROUP_I:%.*]] = call i64 
@llvm.spv.thread.id.in.group.i64(i32 0)
+// SYCL-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SPV_THREAD_ID_IN_GROUP_I]] to i32
+// SYCL-NEXT:    ret i32 [[CONV_I]]
+//
 // C89-LABEL: define dso_local i32 @foo(
 // C89-SAME: ) #[[ATTR0:[0-9]+]] {
 // C89-NEXT:  [[ENTRY:.*:]]


        
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
