[llvm-branch-commits] [clang] [llvm] [AMDGPU] v_cvt_scalef32_sr_pk16_* gfx1250 instructions (PR #151810)

2025-08-02 Thread Stanislav Mekhanoshin via llvm-branch-commits

https://github.com/rampitec updated 
https://github.com/llvm/llvm-project/pull/151810

>From ff0a7f87901b9dc50e0fd5ec09f6000c25b5e91f Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Sat, 2 Aug 2025 02:11:34 -0700
Subject: [PATCH] [AMDGPU] v_cvt_scalef32_sr_pk16_* gfx1250 instructions

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   6 +
 .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl  |  42 
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |   6 +
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   6 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |   3 +
 llvm/lib/Target/AMDGPU/VOP3Instructions.td|  12 +
 .../llvm.amdgcn.cvt.scalef32.sr.pk16.ll   | 232 ++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s |  36 +++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s|  36 +++
 .../Disassembler/AMDGPU/gfx1250_dasm_vop3.txt |  36 +++
 10 files changed, 415 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 9125315310306..ced758c814105 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -746,6 +746,12 @@ 
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f32, "V2UiV8fUif", "nc",
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32, "UiV8fUif", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16, "UiV8hUif", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16, "UiV8yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16, "V3UiV16yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f16, "V3UiV16hUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f32, "V3UiV16fUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16, "V3UiV16yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f16, "V3UiV16hUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f32, "V3UiV16fUif", 
"nc", "gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index e50ab77f48c79..4ff0571239e71 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -929,6 +929,42 @@ void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 
srcbf8, half8 srch8, float
 // CHECK-NEXT:[[TMP43:%.*]] = call i32 
@llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.bf16(<8 x bfloat> [[TMP40]], i32 
[[TMP41]], float [[TMP42]])
 // CHECK-NEXT:[[TMP44:%.*]] = load ptr addrspace(1), ptr 
[[OUT1_ADDR_ASCAST]], align 8
 // CHECK-NEXT:store i32 [[TMP43]], ptr addrspace(1) [[TMP44]], align 4
+// CHECK-NEXT:[[TMP45:%.*]] = load <16 x bfloat>, ptr 
[[SRCBF16_ADDR_ASCAST]], align 32
+// CHECK-NEXT:[[TMP46:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP47:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 
4
+// CHECK-NEXT:[[TMP48:%.*]] = call <3 x i32> 
@llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.bf16(<16 x bfloat> [[TMP45]], i32 
[[TMP46]], float [[TMP47]])
+// CHECK-NEXT:[[TMP49:%.*]] = load ptr addrspace(1), ptr 
[[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT:store <3 x i32> [[TMP48]], ptr addrspace(1) [[TMP49]], align 
16
+// CHECK-NEXT:[[TMP50:%.*]] = load <16 x half>, ptr 
[[SRCH16_ADDR_ASCAST]], align 32
+// CHECK-NEXT:[[TMP51:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP52:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 
4
+// CHECK-NEXT:[[TMP53:%.*]] = call <3 x i32> 
@llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f16(<16 x half> [[TMP50]], i32 [[TMP51]], 
float [[TMP52]])
+// CHECK-NEXT:[[TMP54:%.*]] = load ptr addrspace(1), ptr 
[[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT:store <3 x i32> [[TMP53]], ptr addrspace(1) [[TMP54]], align 
16
+// CHECK-NEXT:[[TMP55:%.*]] = load <16 x bfloat>, ptr 
[[SRCBF16_ADDR_ASCAST]], align 32
+// CHECK-NEXT:[[TMP56:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP57:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 
4
+// CHECK-NEXT:[[TMP58:%.*]] = call <3 x i32> 
@llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.bf16(<16 x bfloat> [[TMP55]], i32 
[[TMP56]], float [[TMP57]])
+// CHECK-NEXT:[[TMP59:%.*]] = load ptr addrspace(1), ptr 
[[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT:store <3 x i32> [[TMP58]], ptr addrspace(1) [[TMP59]], align 
16
+// CHECK-NEXT:[[TMP60:%.*]] = load <16 x h

[llvm-branch-commits] [clang] [llvm] [AMDGPU] v_cvt_scalef32_sr_pk16_* gfx1250 instructions (PR #151810)

2025-08-02 Thread Stanislav Mekhanoshin via llvm-branch-commits

https://github.com/rampitec updated 
https://github.com/llvm/llvm-project/pull/151810

>From ff0a7f87901b9dc50e0fd5ec09f6000c25b5e91f Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Sat, 2 Aug 2025 02:11:34 -0700
Subject: [PATCH] [AMDGPU] v_cvt_scalef32_sr_pk16_* gfx1250 instructions

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   6 +
 .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl  |  42 
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |   6 +
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   6 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |   3 +
 llvm/lib/Target/AMDGPU/VOP3Instructions.td|  12 +
 .../llvm.amdgcn.cvt.scalef32.sr.pk16.ll   | 232 ++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s |  36 +++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s|  36 +++
 .../Disassembler/AMDGPU/gfx1250_dasm_vop3.txt |  36 +++
 10 files changed, 415 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 9125315310306..ced758c814105 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -746,6 +746,12 @@ 
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f32, "V2UiV8fUif", "nc",
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32, "UiV8fUif", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16, "UiV8hUif", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16, "UiV8yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16, "V3UiV16yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f16, "V3UiV16hUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f32, "V3UiV16fUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16, "V3UiV16yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f16, "V3UiV16hUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f32, "V3UiV16fUif", 
"nc", "gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index e50ab77f48c79..4ff0571239e71 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -929,6 +929,42 @@ void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 
srcbf8, half8 srch8, float
 // CHECK-NEXT:[[TMP43:%.*]] = call i32 
@llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.bf16(<8 x bfloat> [[TMP40]], i32 
[[TMP41]], float [[TMP42]])
 // CHECK-NEXT:[[TMP44:%.*]] = load ptr addrspace(1), ptr 
[[OUT1_ADDR_ASCAST]], align 8
 // CHECK-NEXT:store i32 [[TMP43]], ptr addrspace(1) [[TMP44]], align 4
+// CHECK-NEXT:[[TMP45:%.*]] = load <16 x bfloat>, ptr 
[[SRCBF16_ADDR_ASCAST]], align 32
+// CHECK-NEXT:[[TMP46:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP47:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 
4
+// CHECK-NEXT:[[TMP48:%.*]] = call <3 x i32> 
@llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.bf16(<16 x bfloat> [[TMP45]], i32 
[[TMP46]], float [[TMP47]])
+// CHECK-NEXT:[[TMP49:%.*]] = load ptr addrspace(1), ptr 
[[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT:store <3 x i32> [[TMP48]], ptr addrspace(1) [[TMP49]], align 
16
+// CHECK-NEXT:[[TMP50:%.*]] = load <16 x half>, ptr 
[[SRCH16_ADDR_ASCAST]], align 32
+// CHECK-NEXT:[[TMP51:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP52:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 
4
+// CHECK-NEXT:[[TMP53:%.*]] = call <3 x i32> 
@llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f16(<16 x half> [[TMP50]], i32 [[TMP51]], 
float [[TMP52]])
+// CHECK-NEXT:[[TMP54:%.*]] = load ptr addrspace(1), ptr 
[[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT:store <3 x i32> [[TMP53]], ptr addrspace(1) [[TMP54]], align 
16
+// CHECK-NEXT:[[TMP55:%.*]] = load <16 x bfloat>, ptr 
[[SRCBF16_ADDR_ASCAST]], align 32
+// CHECK-NEXT:[[TMP56:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP57:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 
4
+// CHECK-NEXT:[[TMP58:%.*]] = call <3 x i32> 
@llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.bf16(<16 x bfloat> [[TMP55]], i32 
[[TMP56]], float [[TMP57]])
+// CHECK-NEXT:[[TMP59:%.*]] = load ptr addrspace(1), ptr 
[[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT:store <3 x i32> [[TMP58]], ptr addrspace(1) [[TMP59]], align 
16
+// CHECK-NEXT:[[TMP60:%.*]] = load <16 x h

[llvm-branch-commits] [clang] [llvm] [AMDGPU] v_cvt_scalef32_sr_pk16_* gfx1250 instructions (PR #151810)

2025-08-02 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm approved this pull request.


https://github.com/llvm/llvm-project/pull/151810
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [llvm] [AMDGPU] v_cvt_scalef32_sr_pk16_* gfx1250 instructions (PR #151810)

2025-08-02 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-clang

Author: Stanislav Mekhanoshin (rampitec)


Changes



---

Patch is 34.66 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/151810.diff


10 Files Affected:

- (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+6) 
- (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl (+42) 
- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+6) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+6) 
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+3) 
- (modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+12) 
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll (+232) 
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s (+36) 
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s (+36) 
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt (+36) 


```diff
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 9125315310306..ced758c814105 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -746,6 +746,12 @@ 
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f32, "V2UiV8fUif", "nc",
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32, "UiV8fUif", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16, "UiV8hUif", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16, "UiV8yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16, "V3UiV16yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f16, "V3UiV16hUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f32, "V3UiV16fUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16, "V3UiV16yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f16, "V3UiV16hUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f32, "V3UiV16fUif", 
"nc", "gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index e50ab77f48c79..4ff0571239e71 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -929,6 +929,42 @@ void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 
srcbf8, half8 srch8, float
 // CHECK-NEXT:[[TMP43:%.*]] = call i32 
@llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.bf16(<8 x bfloat> [[TMP40]], i32 
[[TMP41]], float [[TMP42]])
 // CHECK-NEXT:[[TMP44:%.*]] = load ptr addrspace(1), ptr 
[[OUT1_ADDR_ASCAST]], align 8
 // CHECK-NEXT:store i32 [[TMP43]], ptr addrspace(1) [[TMP44]], align 4
+// CHECK-NEXT:[[TMP45:%.*]] = load <16 x bfloat>, ptr 
[[SRCBF16_ADDR_ASCAST]], align 32
+// CHECK-NEXT:[[TMP46:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP47:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 
4
+// CHECK-NEXT:[[TMP48:%.*]] = call <3 x i32> 
@llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.bf16(<16 x bfloat> [[TMP45]], i32 
[[TMP46]], float [[TMP47]])
+// CHECK-NEXT:[[TMP49:%.*]] = load ptr addrspace(1), ptr 
[[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT:store <3 x i32> [[TMP48]], ptr addrspace(1) [[TMP49]], align 
16
+// CHECK-NEXT:[[TMP50:%.*]] = load <16 x half>, ptr 
[[SRCH16_ADDR_ASCAST]], align 32
+// CHECK-NEXT:[[TMP51:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP52:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 
4
+// CHECK-NEXT:[[TMP53:%.*]] = call <3 x i32> 
@llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f16(<16 x half> [[TMP50]], i32 [[TMP51]], 
float [[TMP52]])
+// CHECK-NEXT:[[TMP54:%.*]] = load ptr addrspace(1), ptr 
[[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT:store <3 x i32> [[TMP53]], ptr addrspace(1) [[TMP54]], align 
16
+// CHECK-NEXT:[[TMP55:%.*]] = load <16 x bfloat>, ptr 
[[SRCBF16_ADDR_ASCAST]], align 32
+// CHECK-NEXT:[[TMP56:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP57:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 
4
+// CHECK-NEXT:[[TMP58:%.*]] = call <3 x i32> 
@llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.bf16(<16 x bfloat> [[TMP55]], i32 
[[TMP56]], float [[TMP57]])
+// CHECK-NEXT:[[TMP59:%.*]] = load ptr addrspace(1), ptr 
[[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT:store <3 x i32> [[TMP58]], ptr addrspace(1) [[TMP59]], align 
16
+// CHECK-NEXT:[[TMP60:%.*]] = load <16 x half>, ptr 
[[SRCH16_ADDR_ASCAST]], align 32
+// CHECK-NEXT:[[TMP6

[llvm-branch-commits] [clang] [llvm] [AMDGPU] v_cvt_scalef32_sr_pk16_* gfx1250 instructions (PR #151810)

2025-08-02 Thread via llvm-branch-commits

llvmbot wrote:



@llvm/pr-subscribers-llvm-ir

@llvm/pr-subscribers-backend-amdgpu

Author: Stanislav Mekhanoshin (rampitec)


Changes



---

Patch is 34.66 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/151810.diff


10 Files Affected:

- (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+6) 
- (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl (+42) 
- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+6) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+6) 
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+3) 
- (modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+12) 
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll (+232) 
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s (+36) 
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s (+36) 
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt (+36) 


```diff
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 9125315310306..ced758c814105 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -746,6 +746,12 @@ 
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f32, "V2UiV8fUif", "nc",
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32, "UiV8fUif", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16, "UiV8hUif", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16, "UiV8yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16, "V3UiV16yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f16, "V3UiV16hUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f32, "V3UiV16fUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16, "V3UiV16yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f16, "V3UiV16hUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f32, "V3UiV16fUif", 
"nc", "gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index e50ab77f48c79..4ff0571239e71 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -929,6 +929,42 @@ void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 
srcbf8, half8 srch8, float
 // CHECK-NEXT:[[TMP43:%.*]] = call i32 
@llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.bf16(<8 x bfloat> [[TMP40]], i32 
[[TMP41]], float [[TMP42]])
 // CHECK-NEXT:[[TMP44:%.*]] = load ptr addrspace(1), ptr 
[[OUT1_ADDR_ASCAST]], align 8
 // CHECK-NEXT:store i32 [[TMP43]], ptr addrspace(1) [[TMP44]], align 4
+// CHECK-NEXT:[[TMP45:%.*]] = load <16 x bfloat>, ptr 
[[SRCBF16_ADDR_ASCAST]], align 32
+// CHECK-NEXT:[[TMP46:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP47:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 
4
+// CHECK-NEXT:[[TMP48:%.*]] = call <3 x i32> 
@llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.bf16(<16 x bfloat> [[TMP45]], i32 
[[TMP46]], float [[TMP47]])
+// CHECK-NEXT:[[TMP49:%.*]] = load ptr addrspace(1), ptr 
[[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT:store <3 x i32> [[TMP48]], ptr addrspace(1) [[TMP49]], align 
16
+// CHECK-NEXT:[[TMP50:%.*]] = load <16 x half>, ptr 
[[SRCH16_ADDR_ASCAST]], align 32
+// CHECK-NEXT:[[TMP51:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP52:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 
4
+// CHECK-NEXT:[[TMP53:%.*]] = call <3 x i32> 
@llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f16(<16 x half> [[TMP50]], i32 [[TMP51]], 
float [[TMP52]])
+// CHECK-NEXT:[[TMP54:%.*]] = load ptr addrspace(1), ptr 
[[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT:store <3 x i32> [[TMP53]], ptr addrspace(1) [[TMP54]], align 
16
+// CHECK-NEXT:[[TMP55:%.*]] = load <16 x bfloat>, ptr 
[[SRCBF16_ADDR_ASCAST]], align 32
+// CHECK-NEXT:[[TMP56:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP57:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 
4
+// CHECK-NEXT:[[TMP58:%.*]] = call <3 x i32> 
@llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.bf16(<16 x bfloat> [[TMP55]], i32 
[[TMP56]], float [[TMP57]])
+// CHECK-NEXT:[[TMP59:%.*]] = load ptr addrspace(1), ptr 
[[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT:store <3 x i32> [[TMP58]], ptr addrspace(1) [[TMP59]], align 
16
+// CHECK-NEXT:[[TMP60:%.*]] = load <16 x half>, ptr 
[[SRCH16_ADDR_ASCAST

[llvm-branch-commits] [clang] [llvm] [AMDGPU] v_cvt_scalef32_sr_pk16_* gfx1250 instructions (PR #151810)

2025-08-02 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-mc

Author: Stanislav Mekhanoshin (rampitec)


Changes



---

Patch is 34.66 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/151810.diff


10 Files Affected:

- (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+6) 
- (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl (+42) 
- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+6) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+6) 
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+3) 
- (modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+12) 
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll (+232) 
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s (+36) 
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s (+36) 
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt (+36) 


```diff
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 9125315310306..ced758c814105 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -746,6 +746,12 @@ 
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f32, "V2UiV8fUif", "nc",
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32, "UiV8fUif", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16, "UiV8hUif", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16, "UiV8yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16, "V3UiV16yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f16, "V3UiV16hUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f32, "V3UiV16fUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16, "V3UiV16yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f16, "V3UiV16hUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f32, "V3UiV16fUif", 
"nc", "gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index e50ab77f48c79..4ff0571239e71 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -929,6 +929,42 @@ void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 
srcbf8, half8 srch8, float
 // CHECK-NEXT:[[TMP43:%.*]] = call i32 
@llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.bf16(<8 x bfloat> [[TMP40]], i32 
[[TMP41]], float [[TMP42]])
 // CHECK-NEXT:[[TMP44:%.*]] = load ptr addrspace(1), ptr 
[[OUT1_ADDR_ASCAST]], align 8
 // CHECK-NEXT:store i32 [[TMP43]], ptr addrspace(1) [[TMP44]], align 4
+// CHECK-NEXT:[[TMP45:%.*]] = load <16 x bfloat>, ptr 
[[SRCBF16_ADDR_ASCAST]], align 32
+// CHECK-NEXT:[[TMP46:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP47:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 
4
+// CHECK-NEXT:[[TMP48:%.*]] = call <3 x i32> 
@llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.bf16(<16 x bfloat> [[TMP45]], i32 
[[TMP46]], float [[TMP47]])
+// CHECK-NEXT:[[TMP49:%.*]] = load ptr addrspace(1), ptr 
[[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT:store <3 x i32> [[TMP48]], ptr addrspace(1) [[TMP49]], align 
16
+// CHECK-NEXT:[[TMP50:%.*]] = load <16 x half>, ptr 
[[SRCH16_ADDR_ASCAST]], align 32
+// CHECK-NEXT:[[TMP51:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP52:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 
4
+// CHECK-NEXT:[[TMP53:%.*]] = call <3 x i32> 
@llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f16(<16 x half> [[TMP50]], i32 [[TMP51]], 
float [[TMP52]])
+// CHECK-NEXT:[[TMP54:%.*]] = load ptr addrspace(1), ptr 
[[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT:store <3 x i32> [[TMP53]], ptr addrspace(1) [[TMP54]], align 
16
+// CHECK-NEXT:[[TMP55:%.*]] = load <16 x bfloat>, ptr 
[[SRCBF16_ADDR_ASCAST]], align 32
+// CHECK-NEXT:[[TMP56:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP57:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 
4
+// CHECK-NEXT:[[TMP58:%.*]] = call <3 x i32> 
@llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.bf16(<16 x bfloat> [[TMP55]], i32 
[[TMP56]], float [[TMP57]])
+// CHECK-NEXT:[[TMP59:%.*]] = load ptr addrspace(1), ptr 
[[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT:store <3 x i32> [[TMP58]], ptr addrspace(1) [[TMP59]], align 
16
+// CHECK-NEXT:[[TMP60:%.*]] = load <16 x half>, ptr 
[[SRCH16_ADDR_ASCAST]], align 32
+// CHECK-NEXT:[[TMP61:%

[llvm-branch-commits] [clang] [llvm] [AMDGPU] v_cvt_scalef32_sr_pk16_* gfx1250 instructions (PR #151810)

2025-08-02 Thread Stanislav Mekhanoshin via llvm-branch-commits

https://github.com/rampitec ready_for_review 
https://github.com/llvm/llvm-project/pull/151810
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [llvm] [AMDGPU] v_cvt_scalef32_sr_pk16_* gfx1250 instructions (PR #151810)

2025-08-02 Thread Stanislav Mekhanoshin via llvm-branch-commits

https://github.com/rampitec updated 
https://github.com/llvm/llvm-project/pull/151810

>From dad0929b323eb7dde3211a43a4b14170fee5d56c Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Sat, 2 Aug 2025 02:11:34 -0700
Subject: [PATCH] [AMDGPU] v_cvt_scalef32_sr_pk16_* gfx1250 instructions

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   6 +
 .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl  |  42 
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |   6 +
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   6 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |   3 +
 llvm/lib/Target/AMDGPU/VOP3Instructions.td|  12 +
 .../llvm.amdgcn.cvt.scalef32.sr.pk16.ll   | 232 ++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s |  36 +++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s|  36 +++
 .../Disassembler/AMDGPU/gfx1250_dasm_vop3.txt |  36 +++
 10 files changed, 415 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 9125315310306..ced758c814105 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -746,6 +746,12 @@ 
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f32, "V2UiV8fUif", "nc",
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32, "UiV8fUif", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16, "UiV8hUif", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16, "UiV8yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16, "V3UiV16yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f16, "V3UiV16hUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f32, "V3UiV16fUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16, "V3UiV16yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f16, "V3UiV16hUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f32, "V3UiV16fUif", 
"nc", "gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index e50ab77f48c79..4ff0571239e71 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -929,6 +929,42 @@ void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 
srcbf8, half8 srch8, float
 // CHECK-NEXT:[[TMP43:%.*]] = call i32 
@llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.bf16(<8 x bfloat> [[TMP40]], i32 
[[TMP41]], float [[TMP42]])
 // CHECK-NEXT:[[TMP44:%.*]] = load ptr addrspace(1), ptr 
[[OUT1_ADDR_ASCAST]], align 8
 // CHECK-NEXT:store i32 [[TMP43]], ptr addrspace(1) [[TMP44]], align 4
+// CHECK-NEXT:[[TMP45:%.*]] = load <16 x bfloat>, ptr 
[[SRCBF16_ADDR_ASCAST]], align 32
+// CHECK-NEXT:[[TMP46:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP47:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 
4
+// CHECK-NEXT:[[TMP48:%.*]] = call <3 x i32> 
@llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.bf16(<16 x bfloat> [[TMP45]], i32 
[[TMP46]], float [[TMP47]])
+// CHECK-NEXT:[[TMP49:%.*]] = load ptr addrspace(1), ptr 
[[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT:store <3 x i32> [[TMP48]], ptr addrspace(1) [[TMP49]], align 
16
+// CHECK-NEXT:[[TMP50:%.*]] = load <16 x half>, ptr 
[[SRCH16_ADDR_ASCAST]], align 32
+// CHECK-NEXT:[[TMP51:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP52:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 
4
+// CHECK-NEXT:[[TMP53:%.*]] = call <3 x i32> 
@llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f16(<16 x half> [[TMP50]], i32 [[TMP51]], 
float [[TMP52]])
+// CHECK-NEXT:[[TMP54:%.*]] = load ptr addrspace(1), ptr 
[[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT:store <3 x i32> [[TMP53]], ptr addrspace(1) [[TMP54]], align 
16
+// CHECK-NEXT:[[TMP55:%.*]] = load <16 x bfloat>, ptr 
[[SRCBF16_ADDR_ASCAST]], align 32
+// CHECK-NEXT:[[TMP56:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP57:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 
4
+// CHECK-NEXT:[[TMP58:%.*]] = call <3 x i32> 
@llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.bf16(<16 x bfloat> [[TMP55]], i32 
[[TMP56]], float [[TMP57]])
+// CHECK-NEXT:[[TMP59:%.*]] = load ptr addrspace(1), ptr 
[[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT:store <3 x i32> [[TMP58]], ptr addrspace(1) [[TMP59]], align 
16
+// CHECK-NEXT:[[TMP60:%.*]] = load <16 x h

[llvm-branch-commits] [clang] [llvm] [AMDGPU] v_cvt_scalef32_sr_pk16_* gfx1250 instructions (PR #151810)

2025-08-02 Thread Stanislav Mekhanoshin via llvm-branch-commits

https://github.com/rampitec updated 
https://github.com/llvm/llvm-project/pull/151810

>From dad0929b323eb7dde3211a43a4b14170fee5d56c Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Sat, 2 Aug 2025 02:11:34 -0700
Subject: [PATCH] [AMDGPU] v_cvt_scalef32_sr_pk16_* gfx1250 instructions

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   6 +
 .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl  |  42 
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |   6 +
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   6 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |   3 +
 llvm/lib/Target/AMDGPU/VOP3Instructions.td|  12 +
 .../llvm.amdgcn.cvt.scalef32.sr.pk16.ll   | 232 ++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s |  36 +++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s|  36 +++
 .../Disassembler/AMDGPU/gfx1250_dasm_vop3.txt |  36 +++
 10 files changed, 415 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 9125315310306..ced758c814105 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -746,6 +746,12 @@ 
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f32, "V2UiV8fUif", "nc",
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32, "UiV8fUif", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16, "UiV8hUif", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16, "UiV8yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16, "V3UiV16yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f16, "V3UiV16hUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f32, "V3UiV16fUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16, "V3UiV16yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f16, "V3UiV16hUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f32, "V3UiV16fUif", 
"nc", "gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index e50ab77f48c79..4ff0571239e71 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -929,6 +929,42 @@ void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 
srcbf8, half8 srch8, float
 // CHECK-NEXT:[[TMP43:%.*]] = call i32 
@llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.bf16(<8 x bfloat> [[TMP40]], i32 
[[TMP41]], float [[TMP42]])
 // CHECK-NEXT:[[TMP44:%.*]] = load ptr addrspace(1), ptr 
[[OUT1_ADDR_ASCAST]], align 8
 // CHECK-NEXT:store i32 [[TMP43]], ptr addrspace(1) [[TMP44]], align 4
+// CHECK-NEXT:[[TMP45:%.*]] = load <16 x bfloat>, ptr 
[[SRCBF16_ADDR_ASCAST]], align 32
+// CHECK-NEXT:[[TMP46:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP47:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 
4
+// CHECK-NEXT:[[TMP48:%.*]] = call <3 x i32> 
@llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.bf16(<16 x bfloat> [[TMP45]], i32 
[[TMP46]], float [[TMP47]])
+// CHECK-NEXT:[[TMP49:%.*]] = load ptr addrspace(1), ptr 
[[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT:store <3 x i32> [[TMP48]], ptr addrspace(1) [[TMP49]], align 
16
+// CHECK-NEXT:[[TMP50:%.*]] = load <16 x half>, ptr 
[[SRCH16_ADDR_ASCAST]], align 32
+// CHECK-NEXT:[[TMP51:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP52:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 
4
+// CHECK-NEXT:[[TMP53:%.*]] = call <3 x i32> 
@llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f16(<16 x half> [[TMP50]], i32 [[TMP51]], 
float [[TMP52]])
+// CHECK-NEXT:[[TMP54:%.*]] = load ptr addrspace(1), ptr 
[[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT:store <3 x i32> [[TMP53]], ptr addrspace(1) [[TMP54]], align 
16
+// CHECK-NEXT:[[TMP55:%.*]] = load <16 x bfloat>, ptr 
[[SRCBF16_ADDR_ASCAST]], align 32
+// CHECK-NEXT:[[TMP56:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP57:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 
4
+// CHECK-NEXT:[[TMP58:%.*]] = call <3 x i32> 
@llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.bf16(<16 x bfloat> [[TMP55]], i32 
[[TMP56]], float [[TMP57]])
+// CHECK-NEXT:[[TMP59:%.*]] = load ptr addrspace(1), ptr 
[[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT:store <3 x i32> [[TMP58]], ptr addrspace(1) [[TMP59]], align 
16
+// CHECK-NEXT:[[TMP60:%.*]] = load <16 x h

[llvm-branch-commits] [clang] [llvm] [AMDGPU] v_cvt_scalef32_sr_pk16_* gfx1250 instructions (PR #151810)

2025-08-02 Thread Stanislav Mekhanoshin via llvm-branch-commits

rampitec wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is 
> open. Once all requirements are satisfied, merge this PR as a stack 
> <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/151810?utm_source=stack-comment-downstack-mergeability-warning">on Graphite</a>.
> <a href="https://graphite.dev/docs/merge-pull-requests">Learn more</a>

* **#151810** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/151810?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> 👈 <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/151810?utm_source=stack-comment-view-in-graphite" target="_blank">(View in Graphite)</a>
* **#151807** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/151807?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#151804** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/151804?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* `main`




This stack of pull requests is managed by <a href="https://graphite.dev?utm-source=stack-comment">Graphite</a>. Learn 
more about <a href="https://stacking.dev/?utm_source=stack-comment">stacking</a>.


https://github.com/llvm/llvm-project/pull/151810
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [llvm] [AMDGPU] v_cvt_scalef32_sr_pk16_* gfx1250 instructions (PR #151810)

2025-08-02 Thread Stanislav Mekhanoshin via llvm-branch-commits

https://github.com/rampitec created 
https://github.com/llvm/llvm-project/pull/151810

None

>From dfe439a1ed94031e238a3acd558cb0035b74e97b Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Sat, 2 Aug 2025 02:11:34 -0700
Subject: [PATCH] [AMDGPU] v_cvt_scalef32_sr_pk16_* gfx1250 instructions

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   6 +
 .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl  |  42 
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |   6 +
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   6 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |   3 +
 llvm/lib/Target/AMDGPU/VOP3Instructions.td|  12 +
 .../llvm.amdgcn.cvt.scalef32.sr.pk16.ll   | 232 ++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s |  36 +++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s|  36 +++
 .../Disassembler/AMDGPU/gfx1250_dasm_vop3.txt |  36 +++
 10 files changed, 415 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 9125315310306..ced758c814105 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -746,6 +746,12 @@ 
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f32, "V2UiV8fUif", "nc",
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32, "UiV8fUif", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16, "UiV8hUif", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16, "UiV8yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16, "V3UiV16yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f16, "V3UiV16hUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f32, "V3UiV16fUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16, "V3UiV16yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f16, "V3UiV16hUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f32, "V3UiV16fUif", 
"nc", "gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index e50ab77f48c79..4ff0571239e71 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -929,6 +929,42 @@ void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 
srcbf8, half8 srch8, float
 // CHECK-NEXT:[[TMP43:%.*]] = call i32 
@llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.bf16(<8 x bfloat> [[TMP40]], i32 
[[TMP41]], float [[TMP42]])
 // CHECK-NEXT:[[TMP44:%.*]] = load ptr addrspace(1), ptr 
[[OUT1_ADDR_ASCAST]], align 8
 // CHECK-NEXT:store i32 [[TMP43]], ptr addrspace(1) [[TMP44]], align 4
+// CHECK-NEXT:[[TMP45:%.*]] = load <16 x bfloat>, ptr 
[[SRCBF16_ADDR_ASCAST]], align 32
+// CHECK-NEXT:[[TMP46:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP47:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 
4
+// CHECK-NEXT:[[TMP48:%.*]] = call <3 x i32> 
@llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.bf16(<16 x bfloat> [[TMP45]], i32 
[[TMP46]], float [[TMP47]])
+// CHECK-NEXT:[[TMP49:%.*]] = load ptr addrspace(1), ptr 
[[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT:store <3 x i32> [[TMP48]], ptr addrspace(1) [[TMP49]], align 
16
+// CHECK-NEXT:[[TMP50:%.*]] = load <16 x half>, ptr 
[[SRCH16_ADDR_ASCAST]], align 32
+// CHECK-NEXT:[[TMP51:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP52:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 
4
+// CHECK-NEXT:[[TMP53:%.*]] = call <3 x i32> 
@llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f16(<16 x half> [[TMP50]], i32 [[TMP51]], 
float [[TMP52]])
+// CHECK-NEXT:[[TMP54:%.*]] = load ptr addrspace(1), ptr 
[[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT:store <3 x i32> [[TMP53]], ptr addrspace(1) [[TMP54]], align 
16
+// CHECK-NEXT:[[TMP55:%.*]] = load <16 x bfloat>, ptr 
[[SRCBF16_ADDR_ASCAST]], align 32
+// CHECK-NEXT:[[TMP56:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP57:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 
4
+// CHECK-NEXT:[[TMP58:%.*]] = call <3 x i32> 
@llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.bf16(<16 x bfloat> [[TMP55]], i32 
[[TMP56]], float [[TMP57]])
+// CHECK-NEXT:[[TMP59:%.*]] = load ptr addrspace(1), ptr 
[[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT:store <3 x i32> [[TMP58]], ptr addrspace(1) [[TMP59]], align 
16
+// CHECK-NEXT:[[TMP60:%.*]] = load <