https://github.com/ranapratap55 created https://github.com/llvm/llvm-project/pull/174707
For the AMDGPU cvt and atomic fadd `__builtin_amdgcn_*` builtins, use 'x' instead of 'h' in the builtin definitions so that the 16-bit float operands are typed as _Float16 for HIP/C++ and as half for OpenCL.

From 74e7ffdebd0417fed2a65e516afc882522608c0f Mon Sep 17 00:00:00 2001
From: ranapratap55 <[email protected]>
Date: Wed, 7 Jan 2026 12:01:48 +0530
Subject: [PATCH] [AMDGPU] Modifies cvt and atomic builtin def to take
 _Float16 for HIP/C++

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |  44 +-
 .../CodeGenHIP/builtins-amdgcn-gfx950-cvt.hip | 419 ++++++++++++++++++
 2 files changed, 441 insertions(+), 22 deletions(-)
 create mode 100644 clang/test/CodeGenHIP/builtins-amdgcn-gfx950-cvt.hip

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 24b79c3b69b67..5d1841d8cd987 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -180,7 +180,7 @@ BUILTIN(__builtin_amdgcn_raw_buffer_load_b128, "V4UiQbiiIi", "n")
 
 BUILTIN(__builtin_amdgcn_raw_ptr_buffer_atomic_add_i32, "iiQbiiIi", "")
 TARGET_BUILTIN(__builtin_amdgcn_raw_ptr_buffer_atomic_fadd_f32, "ffQbiiIi", "", "atomic-fadd-rtn-insts")
-TARGET_BUILTIN(__builtin_amdgcn_raw_ptr_buffer_atomic_fadd_v2f16, "V2hV2hQbiiIi", "", "atomic-buffer-global-pk-add-f16-insts")
+TARGET_BUILTIN(__builtin_amdgcn_raw_ptr_buffer_atomic_fadd_v2f16, "V2xV2xQbiiIi", "", "atomic-buffer-global-pk-add-f16-insts")
 TARGET_BUILTIN(__builtin_amdgcn_raw_ptr_buffer_atomic_fmin_f32, "ffQbiiIi", "", "atomic-fmin-fmax-global-f32")
 TARGET_BUILTIN(__builtin_amdgcn_raw_ptr_buffer_atomic_fmax_f32, "ffQbiiIi", "", "atomic-fmin-fmax-global-f32")
 
@@ -268,7 +268,7 @@ TARGET_BUILTIN(__builtin_amdgcn_fmed3h, "hhhh", "nc", "gfx9-insts")
 
 TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_f64, "dd*1d", "", "gfx90a-insts")
 TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_f32, "ff*1f", "", "atomic-fadd-rtn-insts")
-TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2f16, "V2hV2h*1V2h", "t", "atomic-buffer-global-pk-add-f16-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2f16, "V2xV2x*1V2x", "t", "atomic-buffer-global-pk-add-f16-insts")
 TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fmin_f64, "dd*1d", "", "gfx90a-insts")
 TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fmax_f64, "dd*1d", "", "gfx90a-insts")
 
@@ -280,11 +280,11 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_f64, "dd*3d", "", "gfx90a-insts")
 
 TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_f32, "ff*3f", "", "gfx8-insts")
 TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_f32, "ff*0f", "", "gfx940-insts")
-TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2f16, "V2hV2h*0V2h", "t", "atomic-flat-pk-add-16-insts")
+TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2f16, "V2xV2x*0V2x", "t", "atomic-flat-pk-add-16-insts")
 TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2bf16, "V2sV2s*0V2s", "t", "atomic-flat-pk-add-16-insts")
 TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2bf16, "V2sV2s*1V2s", "t", "atomic-global-pk-add-bf16-inst")
 TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2bf16, "V2sV2s*3V2s", "t", "atomic-ds-pk-add-16-insts")
-TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2f16, "V2hV2h*3V2h", "t", "atomic-ds-pk-add-16-insts")
+TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2f16, "V2xV2x*3V2x", "t", "atomic-ds-pk-add-16-insts")
 TARGET_BUILTIN(__builtin_amdgcn_load_to_lds, "vv*v*3IUiIiIUi", "", "vmem-to-lds-load-insts")
 TARGET_BUILTIN(__builtin_amdgcn_global_load_lds, "vv*1v*3IUiIiIUi", "", "vmem-to-lds-load-insts")
 
@@ -643,59 +643,59 @@ TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64, "V4fiV2iV4fi",
 TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64, "V4fiV2iV4fi", "nc", "gfx12-insts,wavefrontsize64")
 
 TARGET_BUILTIN(__builtin_amdgcn_prng_b32, "UiUi", "nc", "prng-inst")
-TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk32_fp6_f16, "V6UiV32hf", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
-TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk32_bf6_f16, "V6UiV32hf", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk32_fp6_f16, "V6UiV32xf", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk32_bf6_f16, "V6UiV32xf", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk32_fp6_bf16, "V6UiV32yf", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk32_bf6_bf16, "V6UiV32yf", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
-TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_f16_fp8, "V2hV2hifIiIb", "nc", "fp8-cvt-scale-insts")
-TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_f16_bf8, "V2hV2hifIiIb", "nc", "bf8-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_f16_fp8, "V2xV2xifIiIb", "nc", "fp8-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_f16_bf8, "V2xV2xifIiIb", "nc", "bf8-cvt-scale-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_f32_fp8, "fifIi", "nc", "fp8-cvt-scale-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_f32_bf8, "fifIi", "nc", "bf8-cvt-scale-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk_fp8_f32, "V2sV2sfffIb", "nc", "fp8-cvt-scale-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk_bf8_f32, "V2sV2sfffIb", "nc", "bf8-cvt-scale-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk_f32_fp8, "V2fUifIb", "nc", "fp8-cvt-scale-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk_f32_bf8, "V2fUifIb", "nc", "bf8-cvt-scale-insts")
-TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk_fp8_f16, "V2sV2sV2hfIb", "nc", "fp8-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk_fp8_f16, "V2sV2sV2xfIb", "nc", "fp8-cvt-scale-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk_fp8_bf16, "V2sV2sV2yfIb", "nc", "fp8-cvt-scale-insts")
-TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk_bf8_f16, "V2sV2sV2hfIb", "nc", "bf8-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk_bf8_f16, "V2sV2sV2xfIb", "nc", "bf8-cvt-scale-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk_bf8_bf16, "V2sV2sV2yfIb", "nc", "bf8-cvt-scale-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk_f32_fp4, "V2fUifIi", "nc", "fp4-cvt-scale-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk_fp4_f32, "UiUifffIi", "nc", "fp4-cvt-scale-insts")
-TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk_f16_fp4, "V2hUifIi", "nc", "fp4-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk_f16_fp4, "V2xUifIi", "nc", "fp4-cvt-scale-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk_bf16_fp4, "V2yUifIi", "nc", "fp4-cvt-scale-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk32_f32_fp6, "V32fV6Uif", "nc", "fp6bf6-cvt-scale-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk32_f32_bf6, "V32fV6Uif", "nc", "fp6bf6-cvt-scale-insts")
-TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk32_f16_fp6, "V32hV6Uif", "nc", "fp6bf6-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk32_f16_fp6, "V32xV6Uif", "nc", "fp6bf6-cvt-scale-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk32_bf16_fp6, "V32yV6Uif", "nc", "fp6bf6-cvt-scale-insts")
-TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk32_f16_bf6, "V32hV6Uif", "nc", "fp6bf6-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk32_f16_bf6, "V32xV6Uif", "nc", "fp6bf6-cvt-scale-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk32_bf16_bf6, "V32yV6Uif", "nc", "fp6bf6-cvt-scale-insts")
-TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk_f16_fp8, "V2hUifIb", "nc", "fp8-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk_f16_fp8, "V2xUifIb", "nc", "fp8-cvt-scale-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk_bf16_fp8, "V2yUifIb", "nc", "fp8-cvt-scale-insts")
-TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk_f16_bf8, "V2hUifIb", "nc", "bf8-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk_f16_bf8, "V2xUifIb", "nc", "bf8-cvt-scale-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk_bf16_bf8, "V2yUifIb", "nc", "bf8-cvt-scale-insts")
-TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk_fp4_f16, "UiUiV2hfIi", "nc", "fp4-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk_fp4_f16, "UiUiV2xfIi", "nc", "fp4-cvt-scale-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk_fp4_bf16, "UiUiV2yfIi", "nc", "fp4-cvt-scale-insts")
-TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f16, "UiUiV2hUifIi", "nc", "fp4-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f16, "UiUiV2xUifIi", "nc", "fp4-cvt-scale-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk_fp4_bf16, "UiUiV2yUifIi", "nc", "fp4-cvt-scale-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f32, "UiUiV2fUifIi", "nc", "fp4-cvt-scale-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_bf8_bf16, "iiyUifIi", "nc", "bf8-cvt-scale-insts")
-TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_bf8_f16, "iihUifIi", "nc", "bf8-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_bf8_f16, "iixUifIi", "nc", "bf8-cvt-scale-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_bf8_f32, "iifUifIi", "nc", "bf8-cvt-scale-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_fp8_bf16, "iiyUifIi", "nc", "fp8-cvt-scale-insts")
-TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_fp8_f16, "iihUifIi", "nc", "fp8-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_fp8_f16, "iixUifIi", "nc", "fp8-cvt-scale-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_fp8_f32, "iifUifIi", "nc", "fp8-cvt-scale-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_bf16, "V6UiV32yUif", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
-TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f16, "V6UiV32hUif", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f16, "V6UiV32xUif", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f32, "V6UiV32fUif", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_bf16, "V6UiV32yUif", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
-TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f16, "V6UiV32hUif", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f16, "V6UiV32xUif", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f32, "V6UiV32fUif", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
 
 TARGET_BUILTIN(__builtin_amdgcn_bitop3_b32, "iiiiIUi", "nc", "bitop3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_bitop3_b16, "ssssIUi", "nc", "bitop3-insts")
 
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf16_f32, "V2yV2yfUiIb", "nc", "f32-to-f16bf16-cvt-sr-insts")
-TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_f16_f32, "V2hV2hfUiIb", "nc", "f32-to-f16bf16-cvt-sr-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_f16_f32, "V2xV2xfUiIb", "nc", "f32-to-f16bf16-cvt-sr-insts")
 
 //===----------------------------------------------------------------------===//
 // GFX1250+ only builtins.
diff --git a/clang/test/CodeGenHIP/builtins-amdgcn-gfx950-cvt.hip b/clang/test/CodeGenHIP/builtins-amdgcn-gfx950-cvt.hip
new file mode 100644
index 0000000000000..4158b4633c3d9
--- /dev/null
+++ b/clang/test/CodeGenHIP/builtins-amdgcn-gfx950-cvt.hip
@@ -0,0 +1,419 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx950 -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
+
+#define __device__ __attribute__((device))
+
+typedef _Float16 v2h __attribute__((ext_vector_type(2)));
+typedef _Float16 v32h __attribute__((ext_vector_type(32)));
+typedef unsigned int v6ui __attribute__((ext_vector_type(6)));
+typedef short v2s __attribute__((ext_vector_type(2)));
+
+// CHECK-LABEL: define dso_local void @_Z34test_cvt_scalef32_pk32_fp6_f16_hipPDv6_jDv32_DF16_f(
+// CHECK-SAME: ptr noundef [[OUT:%.*]], <32 x half> noundef [[A:%.*]], float noundef [[SCALE:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <32 x half>, align 64, addrspace(5)
+// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr
+// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <32 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 64
+// CHECK-NEXT: store float [[SCALE]], ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR_ASCAST]], align 64
+// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.f16(<32 x half> [[TMP0]], float [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <6 x i32> [[TMP2]], ptr [[TMP3]], align 32
+// CHECK-NEXT: ret void
+//
+__device__ void test_cvt_scalef32_pk32_fp6_f16_hip(v6ui* out, v32h a, float scale) {
+  *out = __builtin_amdgcn_cvt_scalef32_pk32_fp6_f16(a, scale);
+}
+
+// CHECK-LABEL: define dso_local void @_Z34test_cvt_scalef32_pk32_bf6_f16_hipPDv6_jDv32_DF16_f(
+// CHECK-SAME: ptr noundef [[OUT:%.*]], <32 x half> noundef [[A:%.*]], float noundef [[SCALE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <32 x half>, align 64, addrspace(5)
+// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr
+// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <32 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 64
+// CHECK-NEXT: store float [[SCALE]], ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR_ASCAST]], align 64
+// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.f16(<32 x half> [[TMP0]], float [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <6 x i32> [[TMP2]], ptr [[TMP3]], align 32
+// CHECK-NEXT: ret void
+//
+__device__ void test_cvt_scalef32_pk32_bf6_f16_hip(v6ui* out, v32h a, float scale) {
+  *out = __builtin_amdgcn_cvt_scalef32_pk32_bf6_f16(a, scale);
+}
+
+// CHECK-LABEL: define dso_local void @_Z29test_cvt_scalef32_f16_fp8_hipPDv2_DF16_S_if(
+// CHECK-SAME: ptr noundef [[OUT:%.*]], <2 x half> noundef [[SRC:%.*]], i32 noundef [[SRC32:%.*]], float noundef [[SCALE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+// CHECK-NEXT: [[SRC32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
+// CHECK-NEXT: [[SRC32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC32_ADDR]] to ptr
+// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr
+// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x half> [[SRC]], ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[SRC32]], ptr [[SRC32_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store float [[SCALE]], ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC32_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = call contract <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP0]], i32 [[TMP1]], float [[TMP2]], i32 0, i1 false)
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x half> [[TMP3]], ptr [[TMP4]], align 4
+// CHECK-NEXT: ret void
+//
+__device__ void test_cvt_scalef32_f16_fp8_hip(v2h* out, v2h src, int src32, float scale) {
+  *out = __builtin_amdgcn_cvt_scalef32_f16_fp8(src, src32, scale, 0, false);
+}
+
+// CHECK-LABEL: define dso_local void @_Z29test_cvt_scalef32_f16_bf8_hipPDv2_DF16_S_if(
+// CHECK-SAME: ptr noundef [[OUT:%.*]], <2 x half> noundef [[SRC:%.*]], i32 noundef [[SRC32:%.*]], float noundef [[SCALE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+// CHECK-NEXT: [[SRC32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
+// CHECK-NEXT: [[SRC32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC32_ADDR]] to ptr
+// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr
+// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x half> [[SRC]], ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[SRC32]], ptr [[SRC32_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store float [[SCALE]], ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC32_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = call contract <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> [[TMP0]], i32 [[TMP1]], float [[TMP2]], i32 0, i1 false)
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x half> [[TMP3]], ptr [[TMP4]], align 4
+// CHECK-NEXT: ret void
+//
+__device__ void test_cvt_scalef32_f16_bf8_hip(v2h* out, v2h src, int src32, float scale) {
+  *out = __builtin_amdgcn_cvt_scalef32_f16_bf8(src, src32, scale, 0, false);
+}
+
+// CHECK-LABEL: define dso_local void @_Z32test_cvt_scalef32_pk_fp8_f16_hipPDv2_sS_Dv2_DF16_f(
+// CHECK-SAME: ptr noundef [[OUT:%.*]], <2 x i16> noundef [[SRC:%.*]], <2 x half> noundef [[A:%.*]], float noundef [[SCALE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr
+// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x i16> [[SRC]], ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store <2 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store float [[SCALE]], ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i16>, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f16(<2 x i16> [[TMP0]], <2 x half> [[TMP1]], float [[TMP2]], i1 false)
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x i16> [[TMP3]], ptr [[TMP4]], align 4
+// CHECK-NEXT: ret void
+//
+__device__ void test_cvt_scalef32_pk_fp8_f16_hip(v2s* out, v2s src, v2h a, float scale) {
+  *out = __builtin_amdgcn_cvt_scalef32_pk_fp8_f16(src, a, scale, false);
+}
+
+// CHECK-LABEL: define dso_local void @_Z32test_cvt_scalef32_pk_bf8_f16_hipPDv2_sS_Dv2_DF16_f(
+// CHECK-SAME: ptr noundef [[OUT:%.*]], <2 x i16> noundef [[SRC:%.*]], <2 x half> noundef [[A:%.*]], float noundef [[SCALE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr
+// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x i16> [[SRC]], ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store <2 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store float [[SCALE]], ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i16>, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f16(<2 x i16> [[TMP0]], <2 x half> [[TMP1]], float [[TMP2]], i1 false)
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x i16> [[TMP3]], ptr [[TMP4]], align 4
+// CHECK-NEXT: ret void
+//
+__device__ void test_cvt_scalef32_pk_bf8_f16_hip(v2s* out, v2s src, v2h a, float scale) {
+  *out = __builtin_amdgcn_cvt_scalef32_pk_bf8_f16(src, a, scale, false);
+}
+
+// CHECK-LABEL: define dso_local void @_Z32test_cvt_scalef32_pk_f16_fp4_hipPDv2_DF16_jf(
+// CHECK-SAME: ptr noundef [[OUT:%.*]], i32 noundef [[SRC:%.*]], float noundef [[SCALE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
+// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr
+// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[SRC]], ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store float [[SCALE]], ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = call contract <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 [[TMP0]], float [[TMP1]], i32 0)
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x half> [[TMP2]], ptr [[TMP3]], align 4
+// CHECK-NEXT: ret void
+//
+__device__ void test_cvt_scalef32_pk_f16_fp4_hip(v2h* out, unsigned int src, float scale) {
+  *out = __builtin_amdgcn_cvt_scalef32_pk_f16_fp4(src, scale, 0);
+}
+
+// CHECK-LABEL: define dso_local void @_Z34test_cvt_scalef32_pk32_f16_fp6_hipPDv32_DF16_Dv6_jf(
+// CHECK-SAME: ptr noundef [[OUT:%.*]], <6 x i32> noundef [[SRC:%.*]], float noundef [[SCALE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <6 x i32>, align 32, addrspace(5)
+// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
+// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr
+// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <6 x i32> [[SRC]], ptr [[SRC_ADDR_ASCAST]], align 32
+// CHECK-NEXT: store float [[SCALE]], ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load <6 x i32>, ptr [[SRC_ADDR_ASCAST]], align 32
+// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = call contract <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.fp6(<6 x i32> [[TMP0]], float [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <32 x half> [[TMP2]], ptr [[TMP3]], align 64
+// CHECK-NEXT: ret void
+//
+__device__ void test_cvt_scalef32_pk32_f16_fp6_hip(v32h* out, v6ui src, float scale) {
+  *out = __builtin_amdgcn_cvt_scalef32_pk32_f16_fp6(src, scale);
+}
+
+// CHECK-LABEL: define dso_local void @_Z34test_cvt_scalef32_pk32_f16_bf6_hipPDv32_DF16_Dv6_jf(
+// CHECK-SAME: ptr noundef [[OUT:%.*]], <6 x i32> noundef [[SRC:%.*]], float noundef [[SCALE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <6 x i32>, align 32, addrspace(5)
+// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
+// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr
+// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <6 x i32> [[SRC]], ptr [[SRC_ADDR_ASCAST]], align 32
+// CHECK-NEXT: store float [[SCALE]], ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load <6 x i32>, ptr [[SRC_ADDR_ASCAST]], align 32
+// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = call contract <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.bf6(<6 x i32> [[TMP0]], float [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <32 x half> [[TMP2]], ptr [[TMP3]], align 64
+// CHECK-NEXT: ret void
+//
+__device__ void test_cvt_scalef32_pk32_f16_bf6_hip(v32h* out, v6ui src, float scale) {
+  *out = __builtin_amdgcn_cvt_scalef32_pk32_f16_bf6(src, scale);
+}
+
+// CHECK-LABEL: define dso_local void @_Z32test_cvt_scalef32_pk_f16_fp8_hipPDv2_DF16_jf(
+// CHECK-SAME: ptr noundef [[OUT:%.*]], i32 noundef [[SRC:%.*]], float noundef [[SCALE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
+// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr
+// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[SRC]], ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store float [[SCALE]], ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = call contract <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp8(i32 [[TMP0]], float [[TMP1]], i1 false)
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x half> [[TMP2]], ptr [[TMP3]], align 4
+// CHECK-NEXT: ret void
+//
+__device__ void test_cvt_scalef32_pk_f16_fp8_hip(v2h* out, unsigned int src, float scale) {
+  *out = __builtin_amdgcn_cvt_scalef32_pk_f16_fp8(src, scale, false);
+}
+
+// CHECK-LABEL: define dso_local void @_Z32test_cvt_scalef32_pk_f16_bf8_hipPDv2_DF16_jf(
+// CHECK-SAME: ptr noundef [[OUT:%.*]], i32 noundef [[SRC:%.*]], float noundef [[SCALE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
+// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr
+// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[SRC]], ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store float [[SCALE]], ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = call contract <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.bf8(i32 [[TMP0]], float [[TMP1]], i1 false)
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x half> [[TMP2]], ptr [[TMP3]], align 4
+// CHECK-NEXT: ret void
+//
+__device__ void test_cvt_scalef32_pk_f16_bf8_hip(v2h* out, unsigned int src, float scale) {
+  *out = __builtin_amdgcn_cvt_scalef32_pk_f16_bf8(src, scale, false);
+}
+
+// CHECK-LABEL: define dso_local void @_Z32test_cvt_scalef32_pk_fp4_f16_hipPjjDv2_DF16_f(
+// CHECK-SAME: ptr noundef [[OUT:%.*]], i32 noundef [[SRC:%.*]], <2 x half> noundef [[A:%.*]], float noundef [[SCALE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr
+// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[SRC]], ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store <2 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store float [[SCALE]], ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 [[TMP0]], <2 x half> [[TMP1]], float [[TMP2]], i32 0)
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// CHECK-NEXT: ret void
+//
+__device__ void test_cvt_scalef32_pk_fp4_f16_hip(unsigned int* out, unsigned int src, v2h a, float scale) {
+  *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_f16(src, a, scale, 0);
+}
+
+// CHECK-LABEL: define dso_local void @_Z35test_cvt_scalef32_sr_pk_fp4_f16_hipPjjDv2_DF16_jf(
+// CHECK-SAME: ptr noundef [[OUT:%.*]], i32 noundef [[SRC:%.*]], <2 x half> noundef [[A:%.*]], i32 noundef [[SEED:%.*]], float noundef [[SCALE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+// CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: [[SEED_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SEED_ADDR]] to ptr
+// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr
+// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[SRC]], ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store <2 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[SEED]], ptr [[SEED_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store float [[SCALE]], ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 [[TMP0]], <2 x half> [[TMP1]], i32 [[TMP2]], float [[TMP3]], i32 0)
+// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP4]], ptr [[TMP5]], align 4
+// CHECK-NEXT: ret void
+//
+__device__ void test_cvt_scalef32_sr_pk_fp4_f16_hip(unsigned int* out, unsigned int src, v2h a, unsigned int seed, float scale) {
+  *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f16(src, a, seed, scale, 0);
+}
+
+// CHECK-LABEL: define dso_local void @_Z37test_cvt_scalef32_sr_pk32_bf6_f16_hipPDv6_jDv32_DF16_jf(
+// CHECK-SAME: ptr noundef [[OUT:%.*]], <32 x half> noundef [[A:%.*]], i32 noundef [[SEED:%.*]], float noundef [[SCALE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <32 x half>, align 64, addrspace(5)
+// CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: [[SEED_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SEED_ADDR]] to ptr
+// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr
+// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <32 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 64
+// CHECK-NEXT: store i32 [[SEED]], ptr [[SEED_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store float [[SCALE]], ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR_ASCAST]], align 64
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.f16(<32 x half> [[TMP0]], i32 [[TMP1]], float [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <6 x i32> [[TMP3]], ptr [[TMP4]], align 32
+// CHECK-NEXT: ret void
+//
+__device__ void test_cvt_scalef32_sr_pk32_bf6_f16_hip(v6ui* out, v32h a, unsigned int seed, float scale) {
+  *out = __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f16(a, seed, scale);
+}
+
+// CHECK-LABEL: define dso_local void @_Z37test_cvt_scalef32_sr_pk32_fp6_f16_hipPDv6_jDv32_DF16_jf(
+// CHECK-SAME: ptr noundef [[OUT:%.*]], <32 x half> noundef [[A:%.*]], i32 noundef [[SEED:%.*]], float noundef [[SCALE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <32 x half>, align 64, addrspace(5)
+// CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: [[SEED_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SEED_ADDR]] to ptr
+// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr
+// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <32 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 64
+// CHECK-NEXT: store i32 [[SEED]], ptr [[SEED_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store float [[SCALE]], ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR_ASCAST]], align 64
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.f16(<32 x half> [[TMP0]], i32 [[TMP1]], float [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <6 x i32> [[TMP3]], ptr [[TMP4]], align 32
+// CHECK-NEXT: ret void
+//
+__device__ void test_cvt_scalef32_sr_pk32_fp6_f16_hip(v6ui* out, v32h a, unsigned int seed, float scale) {
+  *out = __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f16(a, seed, scale);
+}
+
+// CHECK-LABEL: define dso_local void @_Z23test_cvt_sr_f16_f32_hipPDv2_DF16_S_fj(
+// CHECK-SAME: ptr noundef [[OUT:%.*]], <2 x half> noundef [[SRC:%.*]], float noundef [[A:%.*]], i32 noundef [[SEED:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: [[SEED_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SEED_ADDR]] to ptr
+// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x half> [[SRC]], ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store float [[A]], ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[SEED]], ptr [[SEED_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = call contract <2 x half> @llvm.amdgcn.cvt.sr.f16.f32(<2 x half> [[TMP0]], float [[TMP1]], i32 [[TMP2]], i1 false)
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x half> [[TMP3]], ptr [[TMP4]], align 4
+// CHECK-NEXT: ret void
+//
+__device__ void test_cvt_sr_f16_f32_hip(v2h* out, v2h src, float a, unsigned int seed) {
+  *out = __builtin_amdgcn_cvt_sr_f16_f32(src, a, seed, false);
+}
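A note on the type letters for anyone skimming the .def changes: in clang's builtin signature encoding, 'h' is half (__fp16), 'x' is _Float16, 'y' is __bf16, and a leading 'VN' makes an ext_vector of N elements, so e.g. "V2hV2hfUiIb" becomes "V2xV2xfUiIb". The sketch below shows what that means on the HIP caller side; it is illustrative only and not part of the patch (the function name is made up), mirroring the freestanding style of the new test and assuming a cc1 invocation like the test's RUN line:

// Illustrative HIP device code, not from the patch. Compile with e.g.:
//   %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx950 -emit-llvm -fcuda-is-device
#define __device__ __attribute__((device))

typedef _Float16 v2h __attribute__((ext_vector_type(2)));

// With 'h' -> 'x' in the def, the builtin takes and returns <2 x _Float16>
// in HIP/C++, so a _Float16 ext_vector_type matches the signature directly;
// under OpenCL the same definition is still expected to surface as half2.
__device__ v2h sr_round_to_f16(v2h old, float a, unsigned int seed) {
  return __builtin_amdgcn_cvt_sr_f16_f32(old, a, seed, false);
}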
