[llvm-branch-commits] [clang] [llvm] [AMDGPU] v_cvt_scalef32_sr_pk16_* gfx1250 instructions (PR #151810)
https://github.com/rampitec updated https://github.com/llvm/llvm-project/pull/151810 >From ff0a7f87901b9dc50e0fd5ec09f6000c25b5e91f Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Sat, 2 Aug 2025 02:11:34 -0700 Subject: [PATCH] [AMDGPU] v_cvt_scalef32_sr_pk16_* gfx1250 instructions --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 6 + .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl | 42 llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 6 + .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 6 + llvm/lib/Target/AMDGPU/SIInstrInfo.td | 3 + llvm/lib/Target/AMDGPU/VOP3Instructions.td| 12 + .../llvm.amdgcn.cvt.scalef32.sr.pk16.ll | 232 ++ llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s | 36 +++ llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s| 36 +++ .../Disassembler/AMDGPU/gfx1250_dasm_vop3.txt | 36 +++ 10 files changed, 415 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 9125315310306..ced758c814105 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -746,6 +746,12 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f32, "V2UiV8fUif", "nc", TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32, "UiV8fUif", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16, "UiV8hUif", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16, "UiV8yUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16, "V3UiV16yUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f16, "V3UiV16hUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f32, "V3UiV16fUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16, "V3UiV16yUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f16, "V3UiV16hUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f32, "V3UiV16fUif", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", "fp8e5m3-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", "fp8e5m3-insts") TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts") diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index e50ab77f48c79..4ff0571239e71 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -929,6 +929,42 @@ void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 srcbf8, half8 srch8, float // CHECK-NEXT:[[TMP43:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.bf16(<8 x bfloat> [[TMP40]], i32 [[TMP41]], float [[TMP42]]) // CHECK-NEXT:[[TMP44:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8 // CHECK-NEXT:store i32 [[TMP43]], ptr addrspace(1) [[TMP44]], align 4 +// CHECK-NEXT:[[TMP45:%.*]] = load <16 x bfloat>, ptr [[SRCBF16_ADDR_ASCAST]], align 32 +// CHECK-NEXT:[[TMP46:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP47:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP48:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.bf16(<16 x bfloat> [[TMP45]], i32 [[TMP46]], float [[TMP47]]) +// CHECK-NEXT:[[TMP49:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT:store <3 x i32> [[TMP48]], ptr addrspace(1) [[TMP49]], align 16 +// CHECK-NEXT:[[TMP50:%.*]] = load <16 x half>, ptr [[SRCH16_ADDR_ASCAST]], align 32 +// CHECK-NEXT:[[TMP51:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP52:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP53:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f16(<16 x half> [[TMP50]], i32 [[TMP51]], float [[TMP52]]) +// CHECK-NEXT:[[TMP54:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT:store <3 x i32> [[TMP53]], ptr addrspace(1) [[TMP54]], align 16 +// CHECK-NEXT:[[TMP55:%.*]] = load <16 x bfloat>, ptr [[SRCBF16_ADDR_ASCAST]], align 32 +// CHECK-NEXT:[[TMP56:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP57:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP58:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.bf16(<16 x bfloat> [[TMP55]], i32 [[TMP56]], float [[TMP57]]) +// CHECK-NEXT:[[TMP59:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT:store <3 x i32> [[TMP58]], ptr addrspace(1) [[TMP59]], align 16 +// CHECK-NEXT:[[TMP60:%.*]] = load <16 x h
[llvm-branch-commits] [clang] [llvm] [AMDGPU] v_cvt_scalef32_sr_pk16_* gfx1250 instructions (PR #151810)
https://github.com/rampitec updated https://github.com/llvm/llvm-project/pull/151810 >From ff0a7f87901b9dc50e0fd5ec09f6000c25b5e91f Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Sat, 2 Aug 2025 02:11:34 -0700 Subject: [PATCH] [AMDGPU] v_cvt_scalef32_sr_pk16_* gfx1250 instructions --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 6 + .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl | 42 llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 6 + .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 6 + llvm/lib/Target/AMDGPU/SIInstrInfo.td | 3 + llvm/lib/Target/AMDGPU/VOP3Instructions.td| 12 + .../llvm.amdgcn.cvt.scalef32.sr.pk16.ll | 232 ++ llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s | 36 +++ llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s| 36 +++ .../Disassembler/AMDGPU/gfx1250_dasm_vop3.txt | 36 +++ 10 files changed, 415 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 9125315310306..ced758c814105 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -746,6 +746,12 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f32, "V2UiV8fUif", "nc", TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32, "UiV8fUif", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16, "UiV8hUif", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16, "UiV8yUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16, "V3UiV16yUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f16, "V3UiV16hUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f32, "V3UiV16fUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16, "V3UiV16yUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f16, "V3UiV16hUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f32, "V3UiV16fUif", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", "fp8e5m3-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", "fp8e5m3-insts") TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts") diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index e50ab77f48c79..4ff0571239e71 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -929,6 +929,42 @@ void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 srcbf8, half8 srch8, float // CHECK-NEXT:[[TMP43:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.bf16(<8 x bfloat> [[TMP40]], i32 [[TMP41]], float [[TMP42]]) // CHECK-NEXT:[[TMP44:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8 // CHECK-NEXT:store i32 [[TMP43]], ptr addrspace(1) [[TMP44]], align 4 +// CHECK-NEXT:[[TMP45:%.*]] = load <16 x bfloat>, ptr [[SRCBF16_ADDR_ASCAST]], align 32 +// CHECK-NEXT:[[TMP46:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP47:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP48:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.bf16(<16 x bfloat> [[TMP45]], i32 [[TMP46]], float [[TMP47]]) +// CHECK-NEXT:[[TMP49:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT:store <3 x i32> [[TMP48]], ptr addrspace(1) [[TMP49]], align 16 +// CHECK-NEXT:[[TMP50:%.*]] = load <16 x half>, ptr [[SRCH16_ADDR_ASCAST]], align 32 +// CHECK-NEXT:[[TMP51:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP52:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP53:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f16(<16 x half> [[TMP50]], i32 [[TMP51]], float [[TMP52]]) +// CHECK-NEXT:[[TMP54:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT:store <3 x i32> [[TMP53]], ptr addrspace(1) [[TMP54]], align 16 +// CHECK-NEXT:[[TMP55:%.*]] = load <16 x bfloat>, ptr [[SRCBF16_ADDR_ASCAST]], align 32 +// CHECK-NEXT:[[TMP56:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP57:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP58:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.bf16(<16 x bfloat> [[TMP55]], i32 [[TMP56]], float [[TMP57]]) +// CHECK-NEXT:[[TMP59:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT:store <3 x i32> [[TMP58]], ptr addrspace(1) [[TMP59]], align 16 +// CHECK-NEXT:[[TMP60:%.*]] = load <16 x h
[llvm-branch-commits] [clang] [llvm] [AMDGPU] v_cvt_scalef32_sr_pk16_* gfx1250 instructions (PR #151810)
https://github.com/arsenm approved this pull request. https://github.com/llvm/llvm-project/pull/151810 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [llvm] [AMDGPU] v_cvt_scalef32_sr_pk16_* gfx1250 instructions (PR #151810)
llvmbot wrote: @llvm/pr-subscribers-clang Author: Stanislav Mekhanoshin (rampitec) Changes --- Patch is 34.66 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/151810.diff 10 Files Affected: - (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+6) - (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl (+42) - (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+6) - (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+6) - (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+3) - (modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+12) - (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll (+232) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s (+36) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s (+36) - (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt (+36) ``diff diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 9125315310306..ced758c814105 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -746,6 +746,12 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f32, "V2UiV8fUif", "nc", TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32, "UiV8fUif", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16, "UiV8hUif", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16, "UiV8yUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16, "V3UiV16yUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f16, "V3UiV16hUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f32, "V3UiV16fUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16, "V3UiV16yUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f16, "V3UiV16hUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f32, "V3UiV16fUif", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", "fp8e5m3-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", "fp8e5m3-insts") TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts") diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index e50ab77f48c79..4ff0571239e71 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -929,6 +929,42 @@ void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 srcbf8, half8 srch8, float // CHECK-NEXT:[[TMP43:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.bf16(<8 x bfloat> [[TMP40]], i32 [[TMP41]], float [[TMP42]]) // CHECK-NEXT:[[TMP44:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8 // CHECK-NEXT:store i32 [[TMP43]], ptr addrspace(1) [[TMP44]], align 4 +// CHECK-NEXT:[[TMP45:%.*]] = load <16 x bfloat>, ptr [[SRCBF16_ADDR_ASCAST]], align 32 +// CHECK-NEXT:[[TMP46:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP47:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP48:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.bf16(<16 x bfloat> [[TMP45]], i32 [[TMP46]], float [[TMP47]]) +// CHECK-NEXT:[[TMP49:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT:store <3 x i32> [[TMP48]], ptr addrspace(1) [[TMP49]], align 16 +// CHECK-NEXT:[[TMP50:%.*]] = load <16 x half>, ptr [[SRCH16_ADDR_ASCAST]], align 32 +// CHECK-NEXT:[[TMP51:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP52:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP53:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f16(<16 x half> [[TMP50]], i32 [[TMP51]], float [[TMP52]]) +// CHECK-NEXT:[[TMP54:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT:store <3 x i32> [[TMP53]], ptr addrspace(1) [[TMP54]], align 16 +// CHECK-NEXT:[[TMP55:%.*]] = load <16 x bfloat>, ptr [[SRCBF16_ADDR_ASCAST]], align 32 +// CHECK-NEXT:[[TMP56:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP57:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP58:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.bf16(<16 x bfloat> [[TMP55]], i32 [[TMP56]], float [[TMP57]]) +// CHECK-NEXT:[[TMP59:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT:store <3 x i32> [[TMP58]], ptr addrspace(1) [[TMP59]], align 16 +// CHECK-NEXT:[[TMP60:%.*]] = load <16 x half>, ptr [[SRCH16_ADDR_ASCAST]], align 32 +// CHECK-NEXT:[[TMP6
[llvm-branch-commits] [clang] [llvm] [AMDGPU] v_cvt_scalef32_sr_pk16_* gfx1250 instructions (PR #151810)
llvmbot wrote: @llvm/pr-subscribers-llvm-ir @llvm/pr-subscribers-backend-amdgpu Author: Stanislav Mekhanoshin (rampitec) Changes --- Patch is 34.66 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/151810.diff 10 Files Affected: - (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+6) - (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl (+42) - (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+6) - (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+6) - (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+3) - (modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+12) - (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll (+232) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s (+36) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s (+36) - (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt (+36) ``diff diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 9125315310306..ced758c814105 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -746,6 +746,12 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f32, "V2UiV8fUif", "nc", TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32, "UiV8fUif", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16, "UiV8hUif", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16, "UiV8yUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16, "V3UiV16yUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f16, "V3UiV16hUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f32, "V3UiV16fUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16, "V3UiV16yUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f16, "V3UiV16hUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f32, "V3UiV16fUif", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", "fp8e5m3-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", "fp8e5m3-insts") TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts") diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index e50ab77f48c79..4ff0571239e71 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -929,6 +929,42 @@ void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 srcbf8, half8 srch8, float // CHECK-NEXT:[[TMP43:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.bf16(<8 x bfloat> [[TMP40]], i32 [[TMP41]], float [[TMP42]]) // CHECK-NEXT:[[TMP44:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8 // CHECK-NEXT:store i32 [[TMP43]], ptr addrspace(1) [[TMP44]], align 4 +// CHECK-NEXT:[[TMP45:%.*]] = load <16 x bfloat>, ptr [[SRCBF16_ADDR_ASCAST]], align 32 +// CHECK-NEXT:[[TMP46:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP47:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP48:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.bf16(<16 x bfloat> [[TMP45]], i32 [[TMP46]], float [[TMP47]]) +// CHECK-NEXT:[[TMP49:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT:store <3 x i32> [[TMP48]], ptr addrspace(1) [[TMP49]], align 16 +// CHECK-NEXT:[[TMP50:%.*]] = load <16 x half>, ptr [[SRCH16_ADDR_ASCAST]], align 32 +// CHECK-NEXT:[[TMP51:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP52:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP53:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f16(<16 x half> [[TMP50]], i32 [[TMP51]], float [[TMP52]]) +// CHECK-NEXT:[[TMP54:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT:store <3 x i32> [[TMP53]], ptr addrspace(1) [[TMP54]], align 16 +// CHECK-NEXT:[[TMP55:%.*]] = load <16 x bfloat>, ptr [[SRCBF16_ADDR_ASCAST]], align 32 +// CHECK-NEXT:[[TMP56:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP57:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP58:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.bf16(<16 x bfloat> [[TMP55]], i32 [[TMP56]], float [[TMP57]]) +// CHECK-NEXT:[[TMP59:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT:store <3 x i32> [[TMP58]], ptr addrspace(1) [[TMP59]], align 16 +// CHECK-NEXT:[[TMP60:%.*]] = load <16 x half>, ptr [[SRCH16_ADDR_ASCAST
[llvm-branch-commits] [clang] [llvm] [AMDGPU] v_cvt_scalef32_sr_pk16_* gfx1250 instructions (PR #151810)
llvmbot wrote: @llvm/pr-subscribers-mc Author: Stanislav Mekhanoshin (rampitec) Changes --- Patch is 34.66 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/151810.diff 10 Files Affected: - (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+6) - (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl (+42) - (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+6) - (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+6) - (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+3) - (modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+12) - (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll (+232) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s (+36) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s (+36) - (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt (+36) ``diff diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 9125315310306..ced758c814105 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -746,6 +746,12 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f32, "V2UiV8fUif", "nc", TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32, "UiV8fUif", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16, "UiV8hUif", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16, "UiV8yUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16, "V3UiV16yUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f16, "V3UiV16hUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f32, "V3UiV16fUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16, "V3UiV16yUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f16, "V3UiV16hUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f32, "V3UiV16fUif", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", "fp8e5m3-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", "fp8e5m3-insts") TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts") diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index e50ab77f48c79..4ff0571239e71 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -929,6 +929,42 @@ void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 srcbf8, half8 srch8, float // CHECK-NEXT:[[TMP43:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.bf16(<8 x bfloat> [[TMP40]], i32 [[TMP41]], float [[TMP42]]) // CHECK-NEXT:[[TMP44:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8 // CHECK-NEXT:store i32 [[TMP43]], ptr addrspace(1) [[TMP44]], align 4 +// CHECK-NEXT:[[TMP45:%.*]] = load <16 x bfloat>, ptr [[SRCBF16_ADDR_ASCAST]], align 32 +// CHECK-NEXT:[[TMP46:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP47:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP48:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.bf16(<16 x bfloat> [[TMP45]], i32 [[TMP46]], float [[TMP47]]) +// CHECK-NEXT:[[TMP49:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT:store <3 x i32> [[TMP48]], ptr addrspace(1) [[TMP49]], align 16 +// CHECK-NEXT:[[TMP50:%.*]] = load <16 x half>, ptr [[SRCH16_ADDR_ASCAST]], align 32 +// CHECK-NEXT:[[TMP51:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP52:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP53:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f16(<16 x half> [[TMP50]], i32 [[TMP51]], float [[TMP52]]) +// CHECK-NEXT:[[TMP54:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT:store <3 x i32> [[TMP53]], ptr addrspace(1) [[TMP54]], align 16 +// CHECK-NEXT:[[TMP55:%.*]] = load <16 x bfloat>, ptr [[SRCBF16_ADDR_ASCAST]], align 32 +// CHECK-NEXT:[[TMP56:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP57:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP58:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.bf16(<16 x bfloat> [[TMP55]], i32 [[TMP56]], float [[TMP57]]) +// CHECK-NEXT:[[TMP59:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT:store <3 x i32> [[TMP58]], ptr addrspace(1) [[TMP59]], align 16 +// CHECK-NEXT:[[TMP60:%.*]] = load <16 x half>, ptr [[SRCH16_ADDR_ASCAST]], align 32 +// CHECK-NEXT:[[TMP61:%
[llvm-branch-commits] [clang] [llvm] [AMDGPU] v_cvt_scalef32_sr_pk16_* gfx1250 instructions (PR #151810)
https://github.com/rampitec ready_for_review https://github.com/llvm/llvm-project/pull/151810 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [llvm] [AMDGPU] v_cvt_scalef32_sr_pk16_* gfx1250 instructions (PR #151810)
https://github.com/rampitec updated https://github.com/llvm/llvm-project/pull/151810 >From dad0929b323eb7dde3211a43a4b14170fee5d56c Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Sat, 2 Aug 2025 02:11:34 -0700 Subject: [PATCH] [AMDGPU] v_cvt_scalef32_sr_pk16_* gfx1250 instructions --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 6 + .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl | 42 llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 6 + .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 6 + llvm/lib/Target/AMDGPU/SIInstrInfo.td | 3 + llvm/lib/Target/AMDGPU/VOP3Instructions.td| 12 + .../llvm.amdgcn.cvt.scalef32.sr.pk16.ll | 232 ++ llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s | 36 +++ llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s| 36 +++ .../Disassembler/AMDGPU/gfx1250_dasm_vop3.txt | 36 +++ 10 files changed, 415 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 9125315310306..ced758c814105 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -746,6 +746,12 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f32, "V2UiV8fUif", "nc", TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32, "UiV8fUif", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16, "UiV8hUif", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16, "UiV8yUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16, "V3UiV16yUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f16, "V3UiV16hUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f32, "V3UiV16fUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16, "V3UiV16yUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f16, "V3UiV16hUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f32, "V3UiV16fUif", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", "fp8e5m3-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", "fp8e5m3-insts") TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts") diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index e50ab77f48c79..4ff0571239e71 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -929,6 +929,42 @@ void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 srcbf8, half8 srch8, float // CHECK-NEXT:[[TMP43:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.bf16(<8 x bfloat> [[TMP40]], i32 [[TMP41]], float [[TMP42]]) // CHECK-NEXT:[[TMP44:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8 // CHECK-NEXT:store i32 [[TMP43]], ptr addrspace(1) [[TMP44]], align 4 +// CHECK-NEXT:[[TMP45:%.*]] = load <16 x bfloat>, ptr [[SRCBF16_ADDR_ASCAST]], align 32 +// CHECK-NEXT:[[TMP46:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP47:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP48:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.bf16(<16 x bfloat> [[TMP45]], i32 [[TMP46]], float [[TMP47]]) +// CHECK-NEXT:[[TMP49:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT:store <3 x i32> [[TMP48]], ptr addrspace(1) [[TMP49]], align 16 +// CHECK-NEXT:[[TMP50:%.*]] = load <16 x half>, ptr [[SRCH16_ADDR_ASCAST]], align 32 +// CHECK-NEXT:[[TMP51:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP52:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP53:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f16(<16 x half> [[TMP50]], i32 [[TMP51]], float [[TMP52]]) +// CHECK-NEXT:[[TMP54:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT:store <3 x i32> [[TMP53]], ptr addrspace(1) [[TMP54]], align 16 +// CHECK-NEXT:[[TMP55:%.*]] = load <16 x bfloat>, ptr [[SRCBF16_ADDR_ASCAST]], align 32 +// CHECK-NEXT:[[TMP56:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP57:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP58:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.bf16(<16 x bfloat> [[TMP55]], i32 [[TMP56]], float [[TMP57]]) +// CHECK-NEXT:[[TMP59:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT:store <3 x i32> [[TMP58]], ptr addrspace(1) [[TMP59]], align 16 +// CHECK-NEXT:[[TMP60:%.*]] = load <16 x h
[llvm-branch-commits] [clang] [llvm] [AMDGPU] v_cvt_scalef32_sr_pk16_* gfx1250 instructions (PR #151810)
https://github.com/rampitec updated https://github.com/llvm/llvm-project/pull/151810 >From dad0929b323eb7dde3211a43a4b14170fee5d56c Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Sat, 2 Aug 2025 02:11:34 -0700 Subject: [PATCH] [AMDGPU] v_cvt_scalef32_sr_pk16_* gfx1250 instructions --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 6 + .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl | 42 llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 6 + .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 6 + llvm/lib/Target/AMDGPU/SIInstrInfo.td | 3 + llvm/lib/Target/AMDGPU/VOP3Instructions.td| 12 + .../llvm.amdgcn.cvt.scalef32.sr.pk16.ll | 232 ++ llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s | 36 +++ llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s| 36 +++ .../Disassembler/AMDGPU/gfx1250_dasm_vop3.txt | 36 +++ 10 files changed, 415 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 9125315310306..ced758c814105 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -746,6 +746,12 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f32, "V2UiV8fUif", "nc", TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32, "UiV8fUif", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16, "UiV8hUif", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16, "UiV8yUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16, "V3UiV16yUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f16, "V3UiV16hUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f32, "V3UiV16fUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16, "V3UiV16yUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f16, "V3UiV16hUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f32, "V3UiV16fUif", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", "fp8e5m3-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", "fp8e5m3-insts") TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts") diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index e50ab77f48c79..4ff0571239e71 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -929,6 +929,42 @@ void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 srcbf8, half8 srch8, float // CHECK-NEXT:[[TMP43:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.bf16(<8 x bfloat> [[TMP40]], i32 [[TMP41]], float [[TMP42]]) // CHECK-NEXT:[[TMP44:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8 // CHECK-NEXT:store i32 [[TMP43]], ptr addrspace(1) [[TMP44]], align 4 +// CHECK-NEXT:[[TMP45:%.*]] = load <16 x bfloat>, ptr [[SRCBF16_ADDR_ASCAST]], align 32 +// CHECK-NEXT:[[TMP46:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP47:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP48:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.bf16(<16 x bfloat> [[TMP45]], i32 [[TMP46]], float [[TMP47]]) +// CHECK-NEXT:[[TMP49:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT:store <3 x i32> [[TMP48]], ptr addrspace(1) [[TMP49]], align 16 +// CHECK-NEXT:[[TMP50:%.*]] = load <16 x half>, ptr [[SRCH16_ADDR_ASCAST]], align 32 +// CHECK-NEXT:[[TMP51:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP52:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP53:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f16(<16 x half> [[TMP50]], i32 [[TMP51]], float [[TMP52]]) +// CHECK-NEXT:[[TMP54:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT:store <3 x i32> [[TMP53]], ptr addrspace(1) [[TMP54]], align 16 +// CHECK-NEXT:[[TMP55:%.*]] = load <16 x bfloat>, ptr [[SRCBF16_ADDR_ASCAST]], align 32 +// CHECK-NEXT:[[TMP56:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP57:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP58:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.bf16(<16 x bfloat> [[TMP55]], i32 [[TMP56]], float [[TMP57]]) +// CHECK-NEXT:[[TMP59:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT:store <3 x i32> [[TMP58]], ptr addrspace(1) [[TMP59]], align 16 +// CHECK-NEXT:[[TMP60:%.*]] = load <16 x h
[llvm-branch-commits] [clang] [llvm] [AMDGPU] v_cvt_scalef32_sr_pk16_* gfx1250 instructions (PR #151810)
rampitec wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is > open. Once all requirements are satisfied, merge this PR as a stack href="https://app.graphite.dev/github/pr/llvm/llvm-project/151810?utm_source=stack-comment-downstack-mergeability-warning"; > >on Graphite. > https://graphite.dev/docs/merge-pull-requests";>Learn more * **#151810** https://app.graphite.dev/github/pr/llvm/llvm-project/151810?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> 👈 https://app.graphite.dev/github/pr/llvm/llvm-project/151810?utm_source=stack-comment-view-in-graphite"; target="_blank">(View in Graphite) * **#151807** https://app.graphite.dev/github/pr/llvm/llvm-project/151807?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#151804** https://app.graphite.dev/github/pr/llvm/llvm-project/151804?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * `main` This stack of pull requests is managed by https://graphite.dev?utm-source=stack-comment";>Graphite. Learn more about https://stacking.dev/?utm_source=stack-comment";>stacking. https://github.com/llvm/llvm-project/pull/151810 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [llvm] [AMDGPU] v_cvt_scalef32_sr_pk16_* gfx1250 instructions (PR #151810)
https://github.com/rampitec created https://github.com/llvm/llvm-project/pull/151810 None >From dfe439a1ed94031e238a3acd558cb0035b74e97b Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Sat, 2 Aug 2025 02:11:34 -0700 Subject: [PATCH] [AMDGPU] v_cvt_scalef32_sr_pk16_* gfx1250 instructions --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 6 + .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl | 42 llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 6 + .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 6 + llvm/lib/Target/AMDGPU/SIInstrInfo.td | 3 + llvm/lib/Target/AMDGPU/VOP3Instructions.td| 12 + .../llvm.amdgcn.cvt.scalef32.sr.pk16.ll | 232 ++ llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s | 36 +++ llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s| 36 +++ .../Disassembler/AMDGPU/gfx1250_dasm_vop3.txt | 36 +++ 10 files changed, 415 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 9125315310306..ced758c814105 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -746,6 +746,12 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f32, "V2UiV8fUif", "nc", TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32, "UiV8fUif", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16, "UiV8hUif", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16, "UiV8yUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16, "V3UiV16yUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f16, "V3UiV16hUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f32, "V3UiV16fUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16, "V3UiV16yUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f16, "V3UiV16hUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f32, "V3UiV16fUif", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", "fp8e5m3-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", "fp8e5m3-insts") TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts") diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index e50ab77f48c79..4ff0571239e71 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -929,6 +929,42 @@ void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 srcbf8, half8 srch8, float // CHECK-NEXT:[[TMP43:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.bf16(<8 x bfloat> [[TMP40]], i32 [[TMP41]], float [[TMP42]]) // CHECK-NEXT:[[TMP44:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8 // CHECK-NEXT:store i32 [[TMP43]], ptr addrspace(1) [[TMP44]], align 4 +// CHECK-NEXT:[[TMP45:%.*]] = load <16 x bfloat>, ptr [[SRCBF16_ADDR_ASCAST]], align 32 +// CHECK-NEXT:[[TMP46:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP47:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP48:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.bf16(<16 x bfloat> [[TMP45]], i32 [[TMP46]], float [[TMP47]]) +// CHECK-NEXT:[[TMP49:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT:store <3 x i32> [[TMP48]], ptr addrspace(1) [[TMP49]], align 16 +// CHECK-NEXT:[[TMP50:%.*]] = load <16 x half>, ptr [[SRCH16_ADDR_ASCAST]], align 32 +// CHECK-NEXT:[[TMP51:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP52:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP53:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f16(<16 x half> [[TMP50]], i32 [[TMP51]], float [[TMP52]]) +// CHECK-NEXT:[[TMP54:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT:store <3 x i32> [[TMP53]], ptr addrspace(1) [[TMP54]], align 16 +// CHECK-NEXT:[[TMP55:%.*]] = load <16 x bfloat>, ptr [[SRCBF16_ADDR_ASCAST]], align 32 +// CHECK-NEXT:[[TMP56:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP57:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP58:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.bf16(<16 x bfloat> [[TMP55]], i32 [[TMP56]], float [[TMP57]]) +// CHECK-NEXT:[[TMP59:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT:store <3 x i32> [[TMP58]], ptr addrspace(1) [[TMP59]], align 16 +// CHECK-NEXT:[[TMP60:%.*]] = load <
