https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/149628
>From 622cf016078431b101f600b0ead6fbd81c133ea4 Mon Sep 17 00:00:00 2001 From: Shilei Tian <i...@tianshilei.me> Date: Fri, 18 Jul 2025 21:35:56 -0400 Subject: [PATCH] [SDAG] Lower unsafe bf16 divisions for gfx1250 Co-authored-by: Kosarev, Ivan <ivan.kosa...@amd.com> --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 13 +- llvm/test/CodeGen/AMDGPU/fdiv.bf16.ll | 298 ++++++++++++++++++++++ 2 files changed, 307 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/fdiv.bf16.ll diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index d4e3fa71ada85..03cebc8ed224d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -618,6 +618,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, ISD::FSIN, ISD::FROUND}, MVT::f16, Custom); + // BF16 - VOP1 Actions. + if (Subtarget->hasBF16TransInsts()) + setOperationAction(ISD::FDIV, MVT::bf16, Custom); + setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::f16, Promote); setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::bf16, Promote); @@ -11200,7 +11204,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, // Without !fpmath accuracy information, we can't do more because we don't // know exactly whether rcp is accurate enough to meet !fpmath requirement. // f16 is always accurate enough - if (!AllowInaccurateRcp && VT != MVT::f16) + if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16) return SDValue(); if (CLHS->isExactlyValue(1.0)) { @@ -11227,9 +11231,10 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, } } - // For f16 require afn or arcp. + // For f16 and bf16 require afn or arcp. // For f32 require afn. - if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal())) + if (!AllowInaccurateRcp && + ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal())) return SDValue(); // Turn into multiply by the reciprocal. @@ -11620,7 +11625,7 @@ SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const { if (VT == MVT::f64) return LowerFDIV64(Op, DAG); - if (VT == MVT::f16) + if (VT == MVT::f16 || VT == MVT::bf16) return LowerFDIV16(Op, DAG); llvm_unreachable("Unexpected type for fdiv"); diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.bf16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.bf16.ll new file mode 100644 index 0000000000000..01ebe7d71428b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fdiv.bf16.ll @@ -0,0 +1,298 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GFX1250-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GFX1250-FAKE16 %s + +/* TODO: Support safe bf16 fdiv lowering. +define bfloat @v_fdiv_bf16(bfloat %x, bfloat %y) { + %fdiv = fdiv bfloat %x, %y + ret bfloat %fdiv +} +*/ + +define bfloat @v_rcp_bf16(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_rcp_bf16: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rcp_bf16: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %fdiv = fdiv bfloat 1.0, %x + ret bfloat %fdiv +} + +define bfloat @v_rcp_bf16_abs(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_rcp_bf16_abs: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, |v0.l| +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rcp_bf16_abs: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, |v0| +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %fabs = call bfloat @llvm.fabs.bf16(bfloat %x) + %fdiv = fdiv bfloat 1.0, %fabs + ret bfloat %fdiv +} + +define bfloat @v_rcp_bf16_afn(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_rcp_bf16_afn: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rcp_bf16_afn: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %fdiv = fdiv afn bfloat 1.0, %x + ret bfloat %fdiv +} + +define bfloat @v_rcp_bf16_neg(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_rcp_bf16_neg: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, -v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rcp_bf16_neg: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, -v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %fdiv = fdiv bfloat -1.0, %x + ret bfloat %fdiv +} + +; TODO: Support lowering to v_rsq_bf16. +define bfloat @v_rsq_bf16(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_rsq_bf16: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rsq_bf16: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x) + %fdiv = fdiv contract bfloat 1.0, %sqrt + ret bfloat %fdiv +} + +; TODO: Support lowering to v_rsq_bf16. +define bfloat @v_rsq_bf16_neg(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_rsq_bf16_neg: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, -v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rsq_bf16_neg: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, -v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x) + %fdiv = fdiv contract bfloat -1.0, %sqrt + ret bfloat %fdiv +} + +; TODO: Support lowering to v_rsq_bf16. +define <2 x bfloat> @v_rsq_bf16_multi_use(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_rsq_bf16_multi_use: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v1.l, v1.l +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v1.h, v1.l +; GFX1250-TRUE16-NEXT: v_nop +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rsq_bf16_multi_use: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v1, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v1, v1 +; GFX1250-FAKE16-NEXT: v_nop +; GFX1250-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x) + %fdiv = fdiv contract bfloat 1.0, %sqrt + %r = insertelement <2 x bfloat> zeroinitializer, bfloat %x, i32 0 + %r2 = insertelement <2 x bfloat> %r, bfloat %fdiv, i32 1 + ret <2 x bfloat> %r2 +} + +; TODO: Support lowering to v_rsq_bf16. +define bfloat @v_rsq_bf16_missing_contract0(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_rsq_bf16_missing_contract0: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rsq_bf16_missing_contract0: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %sqrt = call bfloat @llvm.sqrt.bf16(bfloat %x) + %fdiv = fdiv contract bfloat 1.0, %sqrt + ret bfloat %fdiv +} + +; TODO: Support lowering to v_rsq_bf16. +define bfloat @v_rsq_bf16_missing_contract1(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_rsq_bf16_missing_contract1: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rsq_bf16_missing_contract1: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x) + %fdiv = fdiv bfloat 1.0, %sqrt + ret bfloat %fdiv +} + +; TODO: Support lowering to v_rsq_bf16. +define bfloat @v_neg_rsq_bf16_missing_contract1(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_neg_rsq_bf16_missing_contract1: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, -v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_neg_rsq_bf16_missing_contract1: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, -v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x) + %fdiv = fdiv bfloat -1.0, %sqrt + ret bfloat %fdiv +} + +define <2 x bfloat> @v_rsq_v2bf16(<2 x bfloat> %a) { +; GFX1250-TRUE16-LABEL: v_rsq_v2bf16: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.h, v0.h +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_2) +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.h, v0.h +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rsq_v2bf16: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_2) +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v1, v1 +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v1, v1 +; GFX1250-FAKE16-NEXT: v_nop +; GFX1250-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %sqrt = call contract <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> %a) + %fdiv = fdiv contract <2 x bfloat> <bfloat 1.0, bfloat 1.0>, %sqrt + ret <2 x bfloat> %fdiv +} + +define <2 x bfloat> @v_neg_rsq_v2bf16(<2 x bfloat> %a) { +; GFX1250-TRUE16-LABEL: v_neg_rsq_v2bf16: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.h, v0.h +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_2) +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.h, -v0.h +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, -v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_neg_rsq_v2bf16: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_2) +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v1, v1 +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, -v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v1, -v1 +; GFX1250-FAKE16-NEXT: v_nop +; GFX1250-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %sqrt = call contract <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> %a) + %fdiv = fdiv contract <2 x bfloat> <bfloat -1.0, bfloat -1.0>, %sqrt + ret <2 x bfloat> %fdiv +} _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits