https://github.com/shiltian updated
https://github.com/llvm/llvm-project/pull/149450
>From 487881ca26a91b76e24e10e152aa98fedf19414a Mon Sep 17 00:00:00 2001
From: Shilei Tian
Date: Fri, 18 Jul 2025 00:26:15 -0400
Subject: [PATCH] [AMDGPU] Add support for `v_prng_b32` on gfx1250
Co-authored-by: Mekhanoshin, Stanislav
---
.../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl | 19 +++
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 3 +-
llvm/lib/Target/AMDGPU/VOP1Instructions.td| 1 +
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.prng.ll | 11 +++-
llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s | 45
llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s| 45
.../MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s | 52 +++
llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s | 52 +++
.../MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s | 12 +
llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s | 12 +
.../gfx1250_asm_vop3_from_vop1-fake16.s | 36 +
.../MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s| 36 +
.../gfx1250_asm_vop3_from_vop1_dpp16-fake16.s | 44
.../AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s | 44
.../gfx1250_asm_vop3_from_vop1_dpp8-fake16.s | 4 ++
.../AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s | 4 ++
.../Disassembler/AMDGPU/gfx1250_dasm_vop1.txt | 45
.../AMDGPU/gfx1250_dasm_vop1_dpp16.txt| 39 ++
.../AMDGPU/gfx1250_dasm_vop1_dpp8.txt | 9
.../AMDGPU/gfx1250_dasm_vop3_from_vop1.txt| 36 +
.../gfx1250_dasm_vop3_from_vop1_dpp16.txt | 33
.../gfx1250_dasm_vop3_from_vop1_dpp8.txt | 7 ++-
22 files changed, 585 insertions(+), 4 deletions(-)
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index a9ea17642d6ad..d42e51d04ab9d 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -4,6 +4,7 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+typedef unsigned int uint;
typedef half __attribute__((ext_vector_type(2))) half2;
// CHECK-LABEL: @test_setprio_inc_wg(
@@ -42,6 +43,24 @@ void test_s_wait_tensorcnt() {
__builtin_amdgcn_s_wait_tensorcnt(0);
}
+// CHECK-LABEL: @test_prng_b32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT:[[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT:[[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT:[[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT:store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP1:%.*]] = call i32 @llvm.amdgcn.prng.b32(i32 [[TMP0]])
+// CHECK-NEXT:[[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:store i32 [[TMP1]], ptr addrspace(1) [[TMP2]], align 4
+// CHECK-NEXT:ret void
+//
+void test_prng_b32(global uint* out, uint a) {
+ *out = __builtin_amdgcn_prng_b32(a);
+}
+
// CHECK-LABEL: @test_tanh_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT:[[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 3d040fb705a8d..84e4fa1dc84aa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4007,7 +4007,8 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
case Intrinsic::amdgcn_rcp_legacy:
case Intrinsic::amdgcn_rsq_legacy:
case Intrinsic::amdgcn_rsq_clamp:
- case Intrinsic::amdgcn_tanh: {
+ case Intrinsic::amdgcn_tanh:
+ case Intrinsic::amdgcn_prng_b32: {
// FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
SDValue Src = N->getOperand(1);
return Src.isUndef() ? Src : SDValue();
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 1610305e9..3ee90857b34b8 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -1148,6 +1148,7 @@ defm V_MOV_B64 : VOP1_Real_FULL ;
defm V_TANH_F32 : VOP1_Real_FULL;
defm V_TANH_F16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x01f>;
defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>;
+defm V_PRNG_B32 : VOP1_Real_FULL;
defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">;
defm V_CVT_PK_F16_FP8: VOP1_Real_FULL_t16_and_fake16_gfx1250<0x075>;
defm V_CVT_PK_F16_BF8: VOP1_Real_FULL_t1