llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-clang Author: Rana Pratap Reddy (ranapratap55) <details> <summary>Changes</summary> For raytrace and wmma builtins, using 'x' in the def to take _Float16 for HIP/C++ and half for OpenCL. --- Patch is 39.74 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/175039.diff 6 Files Affected: - (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+12-12) - (added) clang/test/CodeGenHIP/builtins-amdgcn-gfx12-wmma-w32.hip (+62) - (added) clang/test/CodeGenHIP/builtins-amdgcn-gfx12-wmma-w64.hip (+62) - (added) clang/test/CodeGenHIP/builtins-amdgcn-raytracing.hip (+96) - (added) clang/test/CodeGenHIP/builtins-amdgcn-wmma-w32.hip (+89) - (added) clang/test/CodeGenHIP/builtins-amdgcn-wmma-w64.hip (+90) ``````````diff diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index bb823704c84c8..f189e34aac707 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -325,9 +325,9 @@ TARGET_BUILTIN(__builtin_amdgcn_s_ttracedata_imm, "vIs", "n", "gfx10-insts") // Postfix h indicates the 4/5-th arguments are half4. //===----------------------------------------------------------------------===// TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray, "V4UiUifV4fV4fV4fV4Ui", "nc", "gfx10-insts") -TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray_h, "V4UiUifV4fV4hV4hV4Ui", "nc", "gfx10-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray_h, "V4UiUifV4fV4xV4xV4Ui", "nc", "gfx10-insts") TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray_l, "V4UiWUifV4fV4fV4fV4Ui", "nc", "gfx10-insts") -TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray_lh, "V4UiWUifV4fV4hV4hV4Ui", "nc", "gfx10-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray_lh, "V4UiWUifV4fV4xV4xV4Ui", "nc", "gfx10-insts") //===----------------------------------------------------------------------===// @@ -343,20 +343,20 @@ TARGET_BUILTIN(__builtin_amdgcn_s_wait_event_export_ready, "v", "n", "gfx11-inst // Postfix w32 indicates the builtin requires wavefront size of 32. // Postfix w64 indicates the builtin requires wavefront size of 64. //===----------------------------------------------------------------------===// -TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w32, "V8fV16hV16hV8f", "nc", "gfx11-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w32, "V8fV16xV16xV8f", "nc", "gfx11-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32, "V8fV16sV16sV8f", "nc", "gfx11-insts,wavefrontsize32") -TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w32, "V16hV16hV16hV16hIb", "nc", "gfx11-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w32, "V16xV16xV16xV16xIb", "nc", "gfx11-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32, "V16sV16sV16sV16sIb", "nc", "gfx11-insts,wavefrontsize32") -TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32, "V16hV16hV16hV16hIb", "nc", "gfx11-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32, "V16xV16xV16xV16xIb", "nc", "gfx11-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32, "V16sV16sV16sV16sIb", "nc", "gfx11-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32, "V8iIbV4iIbV4iV8iIb", "nc", "gfx11-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32, "V8iIbV2iIbV2iV8iIb", "nc", "gfx11-insts,wavefrontsize32") -TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w64, "V4fV16hV16hV4f", "nc", "gfx11-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w64, "V4fV16xV16xV4f", "nc", "gfx11-insts,wavefrontsize64") TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64, "V4fV16sV16sV4f", "nc", "gfx11-insts,wavefrontsize64") -TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w64, "V8hV16hV16hV8hIb", "nc", "gfx11-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w64, "V8xV16xV16xV8xIb", "nc", "gfx11-insts,wavefrontsize64") TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64, "V8sV16sV16sV8sIb", "nc", "gfx11-insts,wavefrontsize64") -TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64, "V8hV16hV16hV8hIb", "nc", "gfx11-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64, "V8xV16xV16xV8xIb", "nc", "gfx11-insts,wavefrontsize64") TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64, "V8sV16sV16sV8sIb", "nc", "gfx11-insts,wavefrontsize64") TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64, "V4iIbV4iIbV4iV4iIb", "nc", "gfx11-insts,wavefrontsize64") TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64, "V4iIbV2iIbV2iV4iIb", "nc", "gfx11-insts,wavefrontsize64") @@ -590,9 +590,9 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn, "V2WUiUiUiV8UiIi", // Therefore, we add an "_gfx12" suffix to distinguish them from the existing // builtins. //===----------------------------------------------------------------------===// -TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12, "V8fV8hV8hV8f", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12, "V8fV8xV8xV8f", "nc", "gfx12-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12, "V8fV8sV8sV8f", "nc", "gfx12-insts,wavefrontsize32") -TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12, "V8hV8hV8hV8h", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12, "V8xV8xV8xV8x", "nc", "gfx12-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12, "V8sV8sV8sV8s", "nc", "gfx12-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12, "V8iIbV2iIbV2iV8iIb", "nc", "gfx12-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12, "V8iIbiIbiV8iIb", "nc", "gfx12-insts,wavefrontsize32") @@ -604,9 +604,9 @@ TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12, "V8fV2iV2iV TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12, "V8iIbV2iIbV2iV8iIb", "nc", "gfx12-insts,wavefrontsize32") -TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12, "V4fV4hV4hV4f", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12, "V4fV4xV4xV4f", "nc", "gfx12-insts,wavefrontsize64") TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12, "V4fV4sV4sV4f", "nc", "gfx12-insts,wavefrontsize64") -TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12, "V4hV4hV4hV4h", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12, "V4xV4xV4xV4x", "nc", "gfx12-insts,wavefrontsize64") TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12, "V4sV4sV4sV4s", "nc", "gfx12-insts,wavefrontsize64") TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12, "V4iIbiIbiV4iIb", "nc", "gfx12-insts,wavefrontsize64") TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12, "V4iIbiIbiV4iIb", "nc", "gfx12-insts,wavefrontsize64") diff --git a/clang/test/CodeGenHIP/builtins-amdgcn-gfx12-wmma-w32.hip b/clang/test/CodeGenHIP/builtins-amdgcn-gfx12-wmma-w32.hip new file mode 100644 index 0000000000000..6e4ec6bf8c107 --- /dev/null +++ b/clang/test/CodeGenHIP/builtins-amdgcn-gfx12-wmma-w32.hip @@ -0,0 +1,62 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 -emit-llvm -fcuda-is-device -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +#define __device__ __attribute__((device)) + +typedef float v8f __attribute__((ext_vector_type(8))); +typedef _Float16 v8h __attribute__((ext_vector_type(8))); + +// CHECK-GFX1200-LABEL: define dso_local void @_Z47test_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12_hipPDv8_fDv8_DF16_S1_S_( +// CHECK-GFX1200-SAME: ptr noundef [[OUT:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-GFX1200-NEXT: [[A_ADDR:%.*]] = alloca <8 x half>, align 16, addrspace(5) +// CHECK-GFX1200-NEXT: [[B_ADDR:%.*]] = alloca <8 x half>, align 16, addrspace(5) +// CHECK-GFX1200-NEXT: [[C_ADDR:%.*]] = alloca <8 x float>, align 32, addrspace(5) +// CHECK-GFX1200-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-GFX1200-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-GFX1200-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-GFX1200-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// CHECK-GFX1200-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-GFX1200-NEXT: store <8 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 16 +// CHECK-GFX1200-NEXT: store <8 x half> [[B]], ptr [[B_ADDR_ASCAST]], align 16 +// CHECK-GFX1200-NEXT: store <8 x float> [[C]], ptr [[C_ADDR_ASCAST]], align 32 +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR_ASCAST]], align 16 +// CHECK-GFX1200-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR_ASCAST]], align 16 +// CHECK-GFX1200-NEXT: [[TMP2:%.*]] = load <8 x float>, ptr [[C_ADDR_ASCAST]], align 32 +// CHECK-GFX1200-NEXT: [[TMP3:%.*]] = call contract <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x float> [[TMP2]]) +// CHECK-GFX1200-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP3]], ptr [[TMP4]], align 32 +// CHECK-GFX1200-NEXT: ret void +// +__device__ void test_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12_hip(v8f* out, v8h a, v8h b, v8f c) { + *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: define dso_local void @_Z47test_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12_hipPDv8_DF16_S_S_S_( +// CHECK-GFX1200-SAME: ptr noundef [[OUT:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-GFX1200-NEXT: [[A_ADDR:%.*]] = alloca <8 x half>, align 16, addrspace(5) +// CHECK-GFX1200-NEXT: [[B_ADDR:%.*]] = alloca <8 x half>, align 16, addrspace(5) +// CHECK-GFX1200-NEXT: [[C_ADDR:%.*]] = alloca <8 x half>, align 16, addrspace(5) +// CHECK-GFX1200-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-GFX1200-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-GFX1200-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-GFX1200-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// CHECK-GFX1200-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-GFX1200-NEXT: store <8 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 16 +// CHECK-GFX1200-NEXT: store <8 x half> [[B]], ptr [[B_ADDR_ASCAST]], align 16 +// CHECK-GFX1200-NEXT: store <8 x half> [[C]], ptr [[C_ADDR_ASCAST]], align 16 +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR_ASCAST]], align 16 +// CHECK-GFX1200-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR_ASCAST]], align 16 +// CHECK-GFX1200-NEXT: [[TMP2:%.*]] = load <8 x half>, ptr [[C_ADDR_ASCAST]], align 16 +// CHECK-GFX1200-NEXT: [[TMP3:%.*]] = call contract <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i1 false) +// CHECK-GFX1200-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-GFX1200-NEXT: store <8 x half> [[TMP3]], ptr [[TMP4]], align 16 +// CHECK-GFX1200-NEXT: ret void +// +__device__ void test_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12_hip(v8h* out, v8h a, v8h b, v8h c) { + *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12(a, b, c); +} diff --git a/clang/test/CodeGenHIP/builtins-amdgcn-gfx12-wmma-w64.hip b/clang/test/CodeGenHIP/builtins-amdgcn-gfx12-wmma-w64.hip new file mode 100644 index 0000000000000..21bae28f85e8a --- /dev/null +++ b/clang/test/CodeGenHIP/builtins-amdgcn-gfx12-wmma-w64.hip @@ -0,0 +1,62 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 -target-feature +wavefrontsize64 -emit-llvm -fcuda-is-device -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +#define __device__ __attribute__((device)) + +typedef float v4f __attribute__((ext_vector_type(4))); +typedef _Float16 v4h __attribute__((ext_vector_type(4))); + +// CHECK-GFX1200-LABEL: define dso_local void @_Z47test_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12_hipPDv4_fDv4_DF16_S1_S_( +// CHECK-GFX1200-SAME: ptr noundef [[OUT:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-GFX1200-NEXT: [[A_ADDR:%.*]] = alloca <4 x half>, align 8, addrspace(5) +// CHECK-GFX1200-NEXT: [[B_ADDR:%.*]] = alloca <4 x half>, align 8, addrspace(5) +// CHECK-GFX1200-NEXT: [[C_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-GFX1200-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-GFX1200-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-GFX1200-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-GFX1200-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// CHECK-GFX1200-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-GFX1200-NEXT: store <4 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-GFX1200-NEXT: store <4 x half> [[B]], ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-GFX1200-NEXT: store <4 x float> [[C]], ptr [[C_ADDR_ASCAST]], align 16 +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-GFX1200-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-GFX1200-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[C_ADDR_ASCAST]], align 16 +// CHECK-GFX1200-NEXT: [[TMP3:%.*]] = call contract <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> [[TMP0]], <4 x half> [[TMP1]], <4 x float> [[TMP2]]) +// CHECK-GFX1200-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP3]], ptr [[TMP4]], align 16 +// CHECK-GFX1200-NEXT: ret void +// +__device__ void test_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12_hip(v4f* out, v4h a, v4h b, v4f c) { + *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: define dso_local void @_Z47test_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12_hipPDv4_DF16_S_S_S_( +// CHECK-GFX1200-SAME: ptr noundef [[OUT:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-GFX1200-NEXT: [[A_ADDR:%.*]] = alloca <4 x half>, align 8, addrspace(5) +// CHECK-GFX1200-NEXT: [[B_ADDR:%.*]] = alloca <4 x half>, align 8, addrspace(5) +// CHECK-GFX1200-NEXT: [[C_ADDR:%.*]] = alloca <4 x half>, align 8, addrspace(5) +// CHECK-GFX1200-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-GFX1200-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-GFX1200-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-GFX1200-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// CHECK-GFX1200-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-GFX1200-NEXT: store <4 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-GFX1200-NEXT: store <4 x half> [[B]], ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-GFX1200-NEXT: store <4 x half> [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-GFX1200-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-GFX1200-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[C_ADDR_ASCAST]], align 8 +// CHECK-GFX1200-NEXT: [[TMP3:%.*]] = call contract <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> [[TMP0]], <4 x half> [[TMP1]], <4 x half> [[TMP2]], i1 false) +// CHECK-GFX1200-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-GFX1200-NEXT: store <4 x half> [[TMP3]], ptr [[TMP4]], align 8 +// CHECK-GFX1200-NEXT: ret void +// +__device__ void test_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12_hip(v4h* out, v4h a, v4h b, v4h c) { + *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12(a, b, c); +} diff --git a/clang/test/CodeGenHIP/builtins-amdgcn-raytracing.hip b/clang/test/CodeGenHIP/builtins-amdgcn-raytracing.hip new file mode 100644 index 0000000000000..1f3c65201da30 --- /dev/null +++ b/clang/test/CodeGenHIP/builtins-amdgcn-raytracing.hip @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1030 -emit-llvm -fcuda-is-device -o - %s | FileCheck %s --check-prefix=CHECK-GFX1030 + +#define __device__ __attribute__((device)) + +typedef unsigned int v4ui __attribute__((ext_vector_type(4))); +typedef float v4f __attribute__((ext_vector_type(4))); +typedef _Float16 v4h __attribute__((ext_vector_type(4))); +typedef unsigned long ulong; + +// CHECK-GFX1030-LABEL: define dso_local void @_Z34test_image_bvh_intersect_ray_h_hipPDv4_jjfDv4_fDv4_DF16_S2_S_( +// CHECK-GFX1030-SAME: ptr noundef [[OUT:%.*]], i32 noundef [[NODE:%.*]], float noundef [[TMAX:%.*]], <4 x float> noundef [[ORIGIN:%.*]], <4 x half> noundef [[DIR:%.*]], <4 x half> noundef [[INV_DIR:%.*]], <4 x i32> noundef [[EXT:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-GFX1030-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1030-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-GFX1030-NEXT: [[NODE_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-GFX1030-NEXT: [[TMAX_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-GFX1030-NEXT: [[ORIGIN_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-GFX1030-NEXT: [[DIR_ADDR:%.*]] = alloca <4 x half>, align 8, addrspace(5) +// CHECK-GFX1030-NEXT: [[INV_DIR_ADDR:%.*]] = alloca <4 x half>, align 8, addrspace(5) +// CHECK-GFX1030-NEXT: [[EXT_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-GFX1030-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-GFX1030-NEXT: [[NODE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[NODE_ADDR]] to ptr +// CHECK-GFX1030-NEXT: [[TMAX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMAX_ADDR]] to ptr +// CHECK-GFX1030-NEXT: [[ORIGIN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ORIGIN_ADDR]] to ptr +// CHECK-GFX1030-NEXT: [[DIR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DIR_ADDR]] to ptr +// CHECK-GFX1030-NEXT: [[INV_DIR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[INV_DIR_ADDR]] to ptr +// CHECK-GFX1030-NEXT: [[EXT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[EXT_ADDR]] to ptr +// CHECK-GFX1030-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-GFX1030-NEXT: store i32 [[NODE]], ptr [[NODE_ADDR_ASCAST]], align 4 +// CHECK-GFX1030-NEXT: store float [[TMAX]], ptr [[TMAX_ADDR_ASCAST]], align 4 +// CHECK-GFX1030-NEXT: store <4 x float> [[ORIGIN]], ptr [[ORIGIN_ADDR_ASCAST]], align 16 +// CHECK-GFX1030-NEXT: store <4 x half> [[DIR]], ptr [[DIR_ADDR_ASCAST]], align 8 +// CHECK-GFX1030-NEXT: store <4 x half> [[INV_DIR]], ptr [[INV_DIR_ADDR_ASCAST]], align 8 +// CHECK-GFX1030-NEXT: store <4 x i32> [[EXT]], ptr [[EXT_ADDR_ASCAST]], align 16 +// CHECK-GFX1030-NEXT: [[TMP0:%.*]] = load i32, ptr [[NODE_ADDR_ASCAST]], align 4 +// CHECK-GFX1030-NEXT: [[TMP1:%.*]] = load float, ptr [[TMAX_ADDR_ASCAST]], align 4 +// CHECK-GFX1030-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr ... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/175039 _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
