Author: Rana Pratap Reddy Date: 2026-01-06T16:30:29+05:30 New Revision: ba1b867cea3ed91d228b1468a8ba21213f57ea4a
URL: https://github.com/llvm/llvm-project/commit/ba1b867cea3ed91d228b1468a8ba21213f57ea4a DIFF: https://github.com/llvm/llvm-project/commit/ba1b867cea3ed91d228b1468a8ba21213f57ea4a.diff LOG: [AMDGPU] Modifies fdot2 builtin def to take _Float16 for HIP/C++ (#174527) For dl `__builtin_amdgcn_fdot2` builtins, using 'x' in the def so that it will take _Float16 for HIP/C++ and half for OpenCL. Added: clang/test/CodeGenHIP/builtins-amdgcn-dl-insts.hip clang/test/CodeGenHIP/builtins-amdgcn-gfx11-dl-insts.hip Modified: clang/include/clang/Basic/BuiltinsAMDGPU.def Removed: ################################################################################ diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 1a78374e0a64b..24b79c3b69b67 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -292,8 +292,8 @@ TARGET_BUILTIN(__builtin_amdgcn_global_load_lds, "vv*1v*3IUiIiIUi", "", "vmem-to // Deep learning builtins. //===----------------------------------------------------------------------===// -TARGET_BUILTIN(__builtin_amdgcn_fdot2, "fV2hV2hfIb", "nc", "dot10-insts") -TARGET_BUILTIN(__builtin_amdgcn_fdot2_f16_f16, "hV2hV2hh", "nc", "dot9-insts") +TARGET_BUILTIN(__builtin_amdgcn_fdot2, "fV2xV2xfIb", "nc", "dot10-insts") +TARGET_BUILTIN(__builtin_amdgcn_fdot2_f16_f16, "xV2xV2xx", "nc", "dot9-insts") TARGET_BUILTIN(__builtin_amdgcn_fdot2_bf16_bf16, "sV2sV2ss", "nc", "dot9-insts") TARGET_BUILTIN(__builtin_amdgcn_fdot2_f32_bf16, "fV2sV2sfIb", "nc", "dot12-insts") TARGET_BUILTIN(__builtin_amdgcn_sdot2, "SiV2SsV2SsSiIb", "nc", "dot2-insts") diff --git a/clang/test/CodeGenHIP/builtins-amdgcn-dl-insts.hip b/clang/test/CodeGenHIP/builtins-amdgcn-dl-insts.hip new file mode 100644 index 0000000000000..d43022dc9a1bb --- /dev/null +++ b/clang/test/CodeGenHIP/builtins-amdgcn-dl-insts.hip @@ -0,0 +1,265 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx906 -emit-llvm -fcuda-is-device -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1011 -emit-llvm -fcuda-is-device -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1012 -emit-llvm -fcuda-is-device -o - %s | FileCheck %s + +#define __device__ __attribute__((device)) + +typedef unsigned int uint; +typedef _Float16 __attribute__((ext_vector_type(2))) half2; +typedef short __attribute__((ext_vector_type(2))) short2; +typedef unsigned short __attribute__((ext_vector_type(2))) ushort2; + +// CHECK-LABEL: define dso_local void @_Z21test_amdgcn_fdot2_hipPfDv2_DF16_S0_f( +// CHECK-SAME: ptr noundef [[FOUT:%.*]], <2 x half> noundef [[V2HA:%.*]], <2 x half> noundef [[V2HB:%.*]], float noundef [[FC:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[FOUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[V2HA_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5) +// CHECK-NEXT: [[V2HB_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5) +// CHECK-NEXT: [[FC_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[FOUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[FOUT_ADDR]] to ptr +// CHECK-NEXT: [[V2HA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V2HA_ADDR]] to ptr +// CHECK-NEXT: [[V2HB_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V2HB_ADDR]] to ptr +// CHECK-NEXT: [[FC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[FC_ADDR]] to ptr +// CHECK-NEXT: store ptr [[FOUT]], ptr [[FOUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x half> [[V2HA]], ptr [[V2HA_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store <2 x half> [[V2HB]], ptr [[V2HB_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[FC]], ptr [[FC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr [[V2HA_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[V2HB_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[FC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = call contract float @llvm.amdgcn.fdot2(<2 x half> [[TMP0]], <2 x half> [[TMP1]], float [[TMP2]], i1 false) +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[FOUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i64 0 +// CHECK-NEXT: store float [[TMP3]], ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load <2 x half>, ptr [[V2HA_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = load <2 x half>, ptr [[V2HB_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[FC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = call contract float @llvm.amdgcn.fdot2(<2 x half> [[TMP5]], <2 x half> [[TMP6]], float [[TMP7]], i1 true) +// CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[FOUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 1 +// CHECK-NEXT: store float [[TMP8]], ptr [[ARRAYIDX1]], align 4 +// CHECK-NEXT: ret void +// +__device__ void test_amdgcn_fdot2_hip(float* fOut, half2 v2hA, half2 v2hB, float fC) { + fOut[0] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, false); + fOut[1] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, true); +} + +// CHECK-LABEL: define dso_local void @_Z21test_amdgcn_sdot2_hipPiDv2_sS0_i( +// CHECK-SAME: ptr noundef [[SIOUT:%.*]], <2 x i16> noundef [[V2SSA:%.*]], <2 x i16> noundef [[V2SSB:%.*]], i32 noundef [[SIC:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SIOUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[V2SSA_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5) +// CHECK-NEXT: [[V2SSB_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5) +// CHECK-NEXT: [[SIC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SIOUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SIOUT_ADDR]] to ptr +// CHECK-NEXT: [[V2SSA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V2SSA_ADDR]] to ptr +// CHECK-NEXT: [[V2SSB_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V2SSB_ADDR]] to ptr +// CHECK-NEXT: [[SIC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SIC_ADDR]] to ptr +// CHECK-NEXT: store ptr [[SIOUT]], ptr [[SIOUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x i16> [[V2SSA]], ptr [[V2SSA_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store <2 x i16> [[V2SSB]], ptr [[V2SSB_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SIC]], ptr [[SIC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i16>, ptr [[V2SSA_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[V2SSB_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SIC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.sdot2(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]], i32 [[TMP2]], i1 false) +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[SIOUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 0 +// CHECK-NEXT: store i32 [[TMP3]], ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load <2 x i16>, ptr [[V2SSA_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = load <2 x i16>, ptr [[V2SSB_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[SIC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.sdot2(<2 x i16> [[TMP5]], <2 x i16> [[TMP6]], i32 [[TMP7]], i1 true) +// CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[SIOUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 1 +// CHECK-NEXT: store i32 [[TMP8]], ptr [[ARRAYIDX1]], align 4 +// CHECK-NEXT: ret void +// +__device__ void test_amdgcn_sdot2_hip(int* siOut, short2 v2ssA, short2 v2ssB, int siC) { + siOut[0] = __builtin_amdgcn_sdot2(v2ssA, v2ssB, siC, false); + siOut[1] = __builtin_amdgcn_sdot2(v2ssA, v2ssB, siC, true); +} + +// CHECK-LABEL: define dso_local void @_Z21test_amdgcn_udot2_hipPjDv2_tS0_j( +// CHECK-SAME: ptr noundef [[UIOUT:%.*]], <2 x i16> noundef [[V2USA:%.*]], <2 x i16> noundef [[V2USB:%.*]], i32 noundef [[UIC:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[UIOUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[V2USA_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5) +// CHECK-NEXT: [[V2USB_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5) +// CHECK-NEXT: [[UIC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[UIOUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[UIOUT_ADDR]] to ptr +// CHECK-NEXT: [[V2USA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V2USA_ADDR]] to ptr +// CHECK-NEXT: [[V2USB_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V2USB_ADDR]] to ptr +// CHECK-NEXT: [[UIC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[UIC_ADDR]] to ptr +// CHECK-NEXT: store ptr [[UIOUT]], ptr [[UIOUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x i16> [[V2USA]], ptr [[V2USA_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store <2 x i16> [[V2USB]], ptr [[V2USB_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[UIC]], ptr [[UIC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i16>, ptr [[V2USA_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[V2USB_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[UIC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.udot2(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]], i32 [[TMP2]], i1 false) +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[UIOUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 0 +// CHECK-NEXT: store i32 [[TMP3]], ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load <2 x i16>, ptr [[V2USA_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = load <2 x i16>, ptr [[V2USB_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[UIC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.udot2(<2 x i16> [[TMP5]], <2 x i16> [[TMP6]], i32 [[TMP7]], i1 true) +// CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[UIOUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 1 +// CHECK-NEXT: store i32 [[TMP8]], ptr [[ARRAYIDX1]], align 4 +// CHECK-NEXT: ret void +// +__device__ void test_amdgcn_udot2_hip(uint* uiOut, ushort2 v2usA, ushort2 v2usB, uint uiC) { + uiOut[0] = __builtin_amdgcn_udot2(v2usA, v2usB, uiC, false); + uiOut[1] = __builtin_amdgcn_udot2(v2usA, v2usB, uiC, true); +} + +// CHECK-LABEL: define dso_local void @_Z21test_amdgcn_sdot4_hipPiiii( +// CHECK-SAME: ptr noundef [[SIOUT:%.*]], i32 noundef [[SIA:%.*]], i32 noundef [[SIB:%.*]], i32 noundef [[SIC:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SIOUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[SIA_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SIB_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SIC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SIOUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SIOUT_ADDR]] to ptr +// CHECK-NEXT: [[SIA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SIA_ADDR]] to ptr +// CHECK-NEXT: [[SIB_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SIB_ADDR]] to ptr +// CHECK-NEXT: [[SIC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SIC_ADDR]] to ptr +// CHECK-NEXT: store ptr [[SIOUT]], ptr [[SIOUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[SIA]], ptr [[SIA_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SIB]], ptr [[SIB_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SIC]], ptr [[SIC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SIA_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SIB_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SIC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.sdot4(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i1 false) +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[SIOUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 0 +// CHECK-NEXT: store i32 [[TMP3]], ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[SIA_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[SIB_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[SIC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.sdot4(i32 [[TMP5]], i32 [[TMP6]], i32 [[TMP7]], i1 true) +// CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[SIOUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 1 +// CHECK-NEXT: store i32 [[TMP8]], ptr [[ARRAYIDX1]], align 4 +// CHECK-NEXT: ret void +// +__device__ void test_amdgcn_sdot4_hip(int* siOut, int siA, int siB, int siC) { + siOut[0] = __builtin_amdgcn_sdot4(siA, siB, siC, false); + siOut[1] = __builtin_amdgcn_sdot4(siA, siB, siC, true); +} + +// CHECK-LABEL: define dso_local void @_Z21test_amdgcn_udot4_hipPjjjj( +// CHECK-SAME: ptr noundef [[UIOUT:%.*]], i32 noundef [[UIA:%.*]], i32 noundef [[UIB:%.*]], i32 noundef [[UIC:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[UIOUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[UIA_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[UIB_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[UIC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[UIOUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[UIOUT_ADDR]] to ptr +// CHECK-NEXT: [[UIA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[UIA_ADDR]] to ptr +// CHECK-NEXT: [[UIB_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[UIB_ADDR]] to ptr +// CHECK-NEXT: [[UIC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[UIC_ADDR]] to ptr +// CHECK-NEXT: store ptr [[UIOUT]], ptr [[UIOUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[UIA]], ptr [[UIA_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[UIB]], ptr [[UIB_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[UIC]], ptr [[UIC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[UIA_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[UIB_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[UIC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.udot4(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i1 false) +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[UIOUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 0 +// CHECK-NEXT: store i32 [[TMP3]], ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[UIA_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[UIB_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[UIC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.udot4(i32 [[TMP5]], i32 [[TMP6]], i32 [[TMP7]], i1 true) +// CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[UIOUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 1 +// CHECK-NEXT: store i32 [[TMP8]], ptr [[ARRAYIDX1]], align 4 +// CHECK-NEXT: ret void +// +__device__ void test_amdgcn_udot4_hip(uint* uiOut, uint uiA, uint uiB, uint uiC) { + uiOut[0] = __builtin_amdgcn_udot4(uiA, uiB, uiC, false); + uiOut[1] = __builtin_amdgcn_udot4(uiA, uiB, uiC, true); +} + +// CHECK-LABEL: define dso_local void @_Z21test_amdgcn_sdot8_hipPiiii( +// CHECK-SAME: ptr noundef [[SIOUT:%.*]], i32 noundef [[SIA:%.*]], i32 noundef [[SIB:%.*]], i32 noundef [[SIC:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SIOUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[SIA_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SIB_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SIC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SIOUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SIOUT_ADDR]] to ptr +// CHECK-NEXT: [[SIA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SIA_ADDR]] to ptr +// CHECK-NEXT: [[SIB_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SIB_ADDR]] to ptr +// CHECK-NEXT: [[SIC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SIC_ADDR]] to ptr +// CHECK-NEXT: store ptr [[SIOUT]], ptr [[SIOUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[SIA]], ptr [[SIA_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SIB]], ptr [[SIB_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SIC]], ptr [[SIC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SIA_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SIB_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SIC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.sdot8(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i1 false) +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[SIOUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 0 +// CHECK-NEXT: store i32 [[TMP3]], ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[SIA_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[SIB_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[SIC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.sdot8(i32 [[TMP5]], i32 [[TMP6]], i32 [[TMP7]], i1 true) +// CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[SIOUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 1 +// CHECK-NEXT: store i32 [[TMP8]], ptr [[ARRAYIDX1]], align 4 +// CHECK-NEXT: ret void +// +__device__ void test_amdgcn_sdot8_hip(int* siOut, int siA, int siB, int siC) { + siOut[0] = __builtin_amdgcn_sdot8(siA, siB, siC, false); + siOut[1] = __builtin_amdgcn_sdot8(siA, siB, siC, true); +} + +// CHECK-LABEL: define dso_local void @_Z21test_amdgcn_udot8_hipPjjjj( +// CHECK-SAME: ptr noundef [[UIOUT:%.*]], i32 noundef [[UIA:%.*]], i32 noundef [[UIB:%.*]], i32 noundef [[UIC:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[UIOUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[UIA_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[UIB_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[UIC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[UIOUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[UIOUT_ADDR]] to ptr +// CHECK-NEXT: [[UIA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[UIA_ADDR]] to ptr +// CHECK-NEXT: [[UIB_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[UIB_ADDR]] to ptr +// CHECK-NEXT: [[UIC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[UIC_ADDR]] to ptr +// CHECK-NEXT: store ptr [[UIOUT]], ptr [[UIOUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[UIA]], ptr [[UIA_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[UIB]], ptr [[UIB_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[UIC]], ptr [[UIC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[UIA_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[UIB_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[UIC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.udot8(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i1 false) +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[UIOUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 0 +// CHECK-NEXT: store i32 [[TMP3]], ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[UIA_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[UIB_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[UIC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.udot8(i32 [[TMP5]], i32 [[TMP6]], i32 [[TMP7]], i1 true) +// CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[UIOUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 1 +// CHECK-NEXT: store i32 [[TMP8]], ptr [[ARRAYIDX1]], align 4 +// CHECK-NEXT: ret void +// +__device__ void test_amdgcn_udot8_hip(uint* uiOut, uint uiA, uint uiB, uint uiC) { + uiOut[0] = __builtin_amdgcn_udot8(uiA, uiB, uiC, false); + uiOut[1] = __builtin_amdgcn_udot8(uiA, uiB, uiC, true); +} + diff --git a/clang/test/CodeGenHIP/builtins-amdgcn-gfx11-dl-insts.hip b/clang/test/CodeGenHIP/builtins-amdgcn-gfx11-dl-insts.hip new file mode 100644 index 0000000000000..09988486c8815 --- /dev/null +++ b/clang/test/CodeGenHIP/builtins-amdgcn-gfx11-dl-insts.hip @@ -0,0 +1,35 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1100 -emit-llvm -fcuda-is-device -o - %s | FileCheck %s + +#define __device__ __attribute__((device)) + +typedef _Float16 __attribute__((ext_vector_type(2))) half2; + +// CHECK-LABEL: define dso_local void @_Z29test_amdgcn_fdot2_f16_f16_hipPDF16_Dv2_DF16_S0_DF16_( +// CHECK-SAME: ptr noundef [[OUT:%.*]], <2 x half> noundef [[A:%.*]], <2 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5) +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca half, align 2, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store <2 x half> [[B]], ptr [[B_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store half [[C]], ptr [[C_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[B_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load half, ptr [[C_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP3:%.*]] = call contract half @llvm.amdgcn.fdot2.f16.f16(<2 x half> [[TMP0]], <2 x half> [[TMP1]], half [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store half [[TMP3]], ptr [[TMP4]], align 2 +// CHECK-NEXT: ret void +// +__device__ void test_amdgcn_fdot2_f16_f16_hip(_Float16* out, half2 a, half2 b, _Float16 c) { + *out = __builtin_amdgcn_fdot2_f16_f16(a, b, c); +} + _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
