llvmbot wrote:
@llvm/pr-subscribers-backend-x86

Author: Joseph Huber (jhuber6)

<details>
<summary>Changes</summary>

Summary:
https://github.com/llvm/llvm-project/pull/174862 and https://github.com/llvm/llvm-project/pull/174655 provided the intrinsics required to get the fundamental operations working on SPIR-V targets. This patch sets up the basic support (as far as I know). This should be the first step towards allowing SPIR-V to build things like the LLVM libc and the OpenMP Device Runtime Library.

The implementations here are intentionally inefficient, for example not using the dedicated SPIR-V opcode for read-first-lane. This is just a starting point so that we can hopefully begin testing things later. I would appreciate someone more familiar with the backend double-checking these.

---

Full diff: https://github.com/llvm/llvm-project/pull/174910.diff

4 Files Affected:

- (modified) clang/lib/Headers/CMakeLists.txt (+1)
- (modified) clang/lib/Headers/gpuintrin.h (+2)
- (added) clang/lib/Headers/spirvintrin.h (+171)
- (modified) clang/test/Headers/gpuintrin_lang.c (+19-1)


``````````diff
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 1b96ac417bf70..c92b370b88d2d 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -297,6 +297,7 @@ set(gpu_files
   gpuintrin.h
   nvptxintrin.h
   amdgpuintrin.h
+  spirvintrin.h
 )
 
 set(windows_only_files
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 7afc82413996b..30f3667adea73 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -60,6 +60,8 @@ _Pragma("omp end declare target");
 #include <nvptxintrin.h>
 #elif defined(__AMDGPU__)
 #include <amdgpuintrin.h>
+#elif defined(__SPIRV__)
+#include <spirvintrin.h>
 #elif !defined(_OPENMP)
 #error "This header is only meant to be used on GPU architectures."
 #endif
diff --git a/clang/lib/Headers/spirvintrin.h b/clang/lib/Headers/spirvintrin.h
new file mode 100644
index 0000000000000..bf5df70583dc6
--- /dev/null
+++ b/clang/lib/Headers/spirvintrin.h
@@ -0,0 +1,171 @@
+//===-- spirvintrin.h - SPIR-V intrinsic functions ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __SPIRVINTRIN_H
+#define __SPIRVINTRIN_H
+
+#ifndef __SPIRV__
+#error "This file is intended for SPIR-V targets or offloading to SPIR-V"
+#endif
+
+#ifndef __GPUINTRIN_H
+#error "Never use <spirvintrin.h> directly; include <gpuintrin.h> instead"
+#endif
+
+_Pragma("omp begin declare target device_type(nohost)");
+_Pragma("omp begin declare variant match(device = {arch(spirv64)})");
+
+// Type aliases to the address spaces used by the SPIR-V backend.
+#define __gpu_private __attribute__((address_space(0)))
+#define __gpu_constant __attribute__((address_space(2)))
+#define __gpu_local __attribute__((address_space(3)))
+#define __gpu_global __attribute__((address_space(1)))
+#define __gpu_generic __attribute__((address_space(4)))
+
+// Attribute to declare a function as a kernel.
+#define __gpu_kernel __attribute__((device_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
+  return __builtin_spirv_num_workgroups(0);
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_y(void) {
+  return __builtin_spirv_num_workgroups(1);
+}
+
+// Returns the number of workgroups in the 'z' dimension of the grid.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_z(void) {
+  return __builtin_spirv_num_workgroups(2);
+}
+
+// Returns the 'x' dimension of the current workgroup's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_x(void) {
+  return __builtin_spirv_workgroup_id(0);
+}
+
+// Returns the 'y' dimension of the current workgroup's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_y(void) {
+  return __builtin_spirv_workgroup_id(1);
+}
+
+// Returns the 'z' dimension of the current workgroup's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_z(void) {
+  return __builtin_spirv_workgroup_id(2);
+}
+
+// Returns the number of workitems in the 'x' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_x(void) {
+  return __builtin_spirv_workgroup_size(0);
+}
+
+// Returns the number of workitems in the 'y' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_y(void) {
+  return __builtin_spirv_workgroup_size(1);
+}
+
+// Returns the number of workitems in the 'z' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_z(void) {
+  return __builtin_spirv_workgroup_size(2);
+}
+
+// Returns the 'x' dimension id of the workitem in the current workgroup.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_x(void) {
+  return __builtin_spirv_local_invocation_id(0);
+}
+
+// Returns the 'y' dimension id of the workitem in the current workgroup.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_y(void) {
+  return __builtin_spirv_local_invocation_id(1);
+}
+
+// Returns the 'z' dimension id of the workitem in the current workgroup.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_z(void) {
+  return __builtin_spirv_local_invocation_id(2);
+}
+
+// Returns the size of a wavefront, either 32 or 64 depending on hardware
+// and compilation options.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_lanes(void) {
+  return __builtin_spirv_subgroup_size();
+}
+
+// Returns the id of the thread inside of a wavefront executing together.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_id(void) {
+  return __builtin_spirv_subgroup_id();
+}
+
+// Returns the bit-mask of active threads in the current wavefront. This
+// implementation is incorrect if the target uses more than 64 lanes.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) {
+  uint32_t [[clang::ext_vector_type(4)]] __mask =
+      __builtin_spirv_subgroup_ballot(1);
+  return __builtin_bit_cast(uint64_t,
+                            __builtin_shufflevector(__mask, __mask, 0, 1));
+}
+
+// Copies the value from the first active thread in the wavefront to the rest.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
+  return __builtin_spirv_subgroup_shuffle(__x,
+                                          __builtin_ctzg(__gpu_lane_mask()));
+}
+
+// Returns a bitmask of threads in the current wavefront for which \p x is
+// true. This implementation is incorrect if the target uses more than 64 lanes.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
+                                                          bool __x) {
+  // ANDing with __lane_mask gives the nvptx semantics when __lane_mask is a
+  // subset of the active threads.
+  uint32_t [[clang::ext_vector_type(4)]] __mask =
+      __builtin_spirv_subgroup_ballot(__x);
+  return __lane_mask & __builtin_bit_cast(uint64_t, __builtin_shufflevector(
+                                                        __mask, __mask, 0, 1));
+}
+
+// Waits for all threads in the wavefront to converge; a no-op on SPIR-V.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
+}
+
+// Shuffles the lanes inside the wavefront according to the given index.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
+                      uint32_t __width) {
+  uint32_t __lane = __idx + (__gpu_lane_id() & ~(__width - 1));
+  return __builtin_spirv_subgroup_shuffle(__x, __lane);
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
+  return __gpu_match_any_u32_impl(__lane_mask, __x);
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
+  return __gpu_match_any_u64_impl(__lane_mask, __x);
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
+  return __gpu_match_all_u32_impl(__lane_mask, __x);
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
+  return __gpu_match_all_u64_impl(__lane_mask, __x);
+}
+
+_Pragma("omp end declare variant");
+_Pragma("omp end declare target");
+
+#endif // __SPIRVINTRIN_H
diff --git a/clang/test/Headers/gpuintrin_lang.c b/clang/test/Headers/gpuintrin_lang.c
index 653f87aea2ce3..e3db72d5ff928 100644
--- a/clang/test/Headers/gpuintrin_lang.c
+++ b/clang/test/Headers/gpuintrin_lang.c
@@ -22,6 +22,11 @@
 // RUN:   -fopenmp-is-target-device -triple amdgcn -emit-llvm %s -o - \
 // RUN:   | FileCheck %s --check-prefix=OPENMP
 //
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include -DSYCL \
+// RUN:   -internal-isystem %S/../../lib/Headers/ -fsycl-is-device \
+// RUN:   -x c++ -triple spirv64 -emit-llvm %s -o - \
+// RUN:   | FileCheck %s --check-prefix=SYCL
+//
 // RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
 // RUN:   -std=c89 -internal-isystem %S/../../lib/Headers/ \
 // RUN:   -triple amdgcn-amd-amdhsa -emit-llvm %s -o - \
@@ -32,11 +37,13 @@
 
 #ifdef __device__
 __device__ int foo() { return __gpu_thread_id_x(); }
+#elif defined(SYCL)
+extern "C" [[clang::sycl_external]] int foo() { return __gpu_thread_id_x(); }
 #else
 // CUDA-LABEL: define dso_local i32 @foo(
 // CUDA-SAME: ) #[[ATTR0:[0-9]+]] {
 // CUDA-NEXT:  [[ENTRY:.*:]]
-// CUDA-NEXT:    [[TMP0:%.*]] = call {{.*}}i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CUDA-NEXT:    [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 // CUDA-NEXT:    ret i32 [[TMP0]]
 //
 // HIP-LABEL: define dso_local i32 @foo(
@@ -61,6 +68,17 @@ __device__ int foo() { return __gpu_thread_id_x(); }
 // OPENMP-NEXT:    [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
 // OPENMP-NEXT:    ret i32 [[TMP0]]
 //
+// SYCL-LABEL: define spir_func i32 @foo(
+// SYCL-SAME: ) #[[ATTR0:[0-9]+]] {
+// SYCL-NEXT:  [[ENTRY:.*:]]
+// SYCL-NEXT:    [[RETVAL_I:%.*]] = alloca i32, align 4
+// SYCL-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
+// SYCL-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr [[RETVAL]] to ptr addrspace(4)
+// SYCL-NEXT:    [[RETVAL_ASCAST_I:%.*]] = addrspacecast ptr [[RETVAL_I]] to ptr addrspace(4)
+// SYCL-NEXT:    [[SPV_THREAD_ID_IN_GROUP_I:%.*]] = call i64 @llvm.spv.thread.id.in.group.i64(i32 0)
+// SYCL-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SPV_THREAD_ID_IN_GROUP_I]] to i32
+// SYCL-NEXT:    ret i32 [[CONV_I]]
+//
 // C89-LABEL: define dso_local i32 @foo(
 // C89-SAME: ) #[[ATTR0:[0-9]+]] {
 // C89-NEXT:  [[ENTRY:.*:]]
``````````

</details>

https://github.com/llvm/llvm-project/pull/174910
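As a usage illustration only (not part of this patch): with the new header in place, portable code written against `gpuintrin.h` should compile for a `spirv64` target roughly the same way it already does for NVPTX and AMDGPU. The kernel below is a minimal sketch; its name and parameters are made up for illustration, and it only uses the macros and functions defined in `spirvintrin.h` above.

```c
// Hypothetical example, not from the patch: a simple elementwise add written
// against the portable gpuintrin.h interface, which resolves to spirvintrin.h
// when __SPIRV__ is defined (e.g. compiling with --target=spirv64).
#include <gpuintrin.h>
#include <stdint.h>

// Each workitem handles one element; the global index is built from the
// workgroup id, workgroup size, and workitem id queries implemented above.
__gpu_kernel void add_arrays(__gpu_global int *out, __gpu_global const int *a,
                             __gpu_global const int *b, uint32_t n) {
  uint32_t id =
      __gpu_block_id_x() * __gpu_num_threads_x() + __gpu_thread_id_x();
  if (id < n)
    out[id] = a[id] + b[id];
}
```

The address-space and `__gpu_kernel` macros come straight from the new header; how such a kernel gets launched is outside the scope of this patch.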
