https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/179225
>From 97f1c601c0de004b0bd23a767a6315103952a5e7 Mon Sep 17 00:00:00 2001 From: Petar Avramovic <[email protected]> Date: Thu, 19 Mar 2026 12:54:42 +0100 Subject: [PATCH] AMDGPU: Improve codegen for VOP2 v_dot2c_f32_f16/bf16 Select VOP2 version when there are no src_modifiers, otherwise VOP3 --- llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 8 + llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 22 ++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 2 + .../AMDGPU/AMDGPUInstructionSelector.cpp | 43 ++- .../Target/AMDGPU/AMDGPUInstructionSelector.h | 5 + llvm/lib/Target/AMDGPU/SIInstrInfo.td | 2 + llvm/lib/Target/AMDGPU/VOP2Instructions.td | 26 +- .../AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll | 61 +--- .../AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll | 68 ++--- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll | 276 ++++-------------- 10 files changed, 188 insertions(+), 325 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index de8722841d3fe2..51a8a476bbf7e3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -51,10 +51,18 @@ def gi_vop3pmodsdot : GIComplexOperandMatcher<s32, "selectVOP3PModsDOT">, GIComplexPatternEquiv<VOP3PModsDOT>; +def gi_vop3pnomodsdot : + GIComplexOperandMatcher<s32, "selectVOP3PNoModsDOT">, + GIComplexPatternEquiv<VOP3PNoModsDOT>; + def gi_vop3pmodsf32 : GIComplexOperandMatcher<s32, "selectVOP3PModsF32">, GIComplexPatternEquiv<VOP3PModsF32>; +def gi_vop3pnomodsf32 : + GIComplexOperandMatcher<s32, "selectVOP3PNoModsF32">, + GIComplexPatternEquiv<VOP3PNoModsF32>; + def gi_wmmaopselvop3pmods : GIComplexOperandMatcher<s32, "selectWMMAOpSelVOP3PMods">, GIComplexPatternEquiv<WMMAOpSelVOP3PMods>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 613dcfeb646a2d..0cb59faf3a4578 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -3691,6 +3691,17 @@ bool 
AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src, return SelectVOP3PMods(In, Src, SrcMods, true); } +bool AMDGPUDAGToDAGISel::SelectVOP3PNoModsDOT(SDValue In, SDValue &Src) const { + SDValue SrcTmp, SrcModsTmp; + SelectVOP3PMods(In, SrcTmp, SrcModsTmp, true); + if (cast<ConstantSDNode>(SrcModsTmp)->getZExtValue() == SISrcMods::OP_SEL_1) { + Src = SrcTmp; + return true; + } + + return false; +} + bool AMDGPUDAGToDAGISel::SelectVOP3PModsF32(SDValue In, SDValue &Src, SDValue &SrcMods) const { SelectVOP3Mods(In, Src, SrcMods); @@ -3700,6 +3711,17 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PModsF32(SDValue In, SDValue &Src, return true; } +bool AMDGPUDAGToDAGISel::SelectVOP3PNoModsF32(SDValue In, SDValue &Src) const { + SDValue SrcTmp, SrcModsTmp; + SelectVOP3PModsF32(In, SrcTmp, SrcModsTmp); + if (cast<ConstantSDNode>(SrcModsTmp)->getZExtValue() == SISrcMods::OP_SEL_1) { + Src = SrcTmp; + return true; + } + + return false; +} + bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const { const ConstantSDNode *C = cast<ConstantSDNode>(In); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 8b12d1d2a800f9..527923698eac21 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -233,7 +233,9 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods, bool IsDOT = false) const; bool SelectVOP3PModsDOT(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3PNoModsDOT(SDValue In, SDValue &Src) const; bool SelectVOP3PModsF32(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3PNoModsF32(SDValue In, SDValue &Src) const; bool SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 80b30b98ab5906..54f56419a4ba97 100644 --- 
a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -4563,6 +4563,17 @@ std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl( return std::pair(Src, Mods); } +std::pair<Register, unsigned> +AMDGPUInstructionSelector::selectVOP3PModsF32Impl(Register Src) const { + unsigned Mods = SISrcMods::OP_SEL_1; + if (Subtarget->isGFX11Plus()) { + unsigned ModsImpl; + std::tie(Src, ModsImpl) = selectVOP3ModsImpl(Src); + Mods |= ModsImpl; + } + return std::pair(Src, Mods); +} + Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded( Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt, bool ForceVGPR) const { @@ -5270,18 +5281,42 @@ AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const { } InstructionSelector::ComplexRendererFns -AMDGPUInstructionSelector::selectVOP3PModsF32(MachineOperand &Root) const { +AMDGPUInstructionSelector::selectVOP3PNoModsDOT(MachineOperand &Root) const { + MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); Register Src; unsigned Mods; - std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); - Mods |= SISrcMods::OP_SEL_1; + std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true /*IsDOT*/); + if (Mods != SISrcMods::OP_SEL_1) + return {}; + return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }}}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3PModsF32(MachineOperand &Root) const { + MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); + Register Reg; + unsigned Mods; + std::tie(Reg, Mods) = selectVOP3PModsF32Impl(Root.getReg()); + + Reg = getLegalRegBank(Reg, Root.getReg(), RBI, MRI, TRI, TII); return {{ - [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods }}; } +InstructionSelector::ComplexRendererFns 
+AMDGPUInstructionSelector::selectVOP3PNoModsF32(MachineOperand &Root) const { + Register Src; + unsigned Mods; + std::tie(Src, Mods) = selectVOP3PModsF32Impl(Root.getReg()); + if (Mods != SISrcMods::OP_SEL_1) + return {}; + + return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }}}; +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods( MachineOperand &Root) const { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 2c9ecc207d8bd1..7a7a6882629340 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -163,6 +163,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector { bool IsCanonicalizing = true, bool AllowAbs = true, bool OpSel = false) const; + std::pair<Register, unsigned> selectVOP3PModsF32Impl(Register Src) const; Register copyToVGPRIfSrcFolded(Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt, @@ -201,7 +202,11 @@ class AMDGPUInstructionSelector final : public InstructionSelector { InstructionSelector::ComplexRendererFns selectVOP3PModsDOT(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns + selectVOP3PNoModsDOT(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns selectVOP3PModsF32(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectVOP3PNoModsF32(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectWMMAOpSelVOP3PMods(MachineOperand &Root) const; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 229cac30d41650..2ec7edbb0dcec4 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1706,7 +1706,9 @@ def VOP3OMods : ComplexPattern<untyped, 3, "SelectVOP3OMods">; def VOP3PMods : ComplexPattern<untyped, 2, "SelectVOP3PMods">; def VOP3PModsDOT : 
ComplexPattern<untyped, 2, "SelectVOP3PModsDOT">; +def VOP3PNoModsDOT : ComplexPattern<untyped, 1, "SelectVOP3PNoModsDOT">; def VOP3PModsF32 : ComplexPattern<untyped, 2, "SelectVOP3PModsF32">; +def VOP3PNoModsF32 : ComplexPattern<untyped, 1, "SelectVOP3PNoModsF32">; def WMMAOpSelVOP3PMods : ComplexPattern<untyped, 1, "SelectWMMAOpSelVOP3PMods">; diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 62c1e7d67e6cde..0753270fc32a74 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -1300,19 +1300,21 @@ let Constraints = "$vdst = $src2", defm V_DOT2C_F32_BF16 : VOP2Inst_VOPD<"v_dot2c_f32_bf16", VOP_DOT_ACC_F32_V2BF16, 0xd, "v_dot2acc_f32_bf16">; } +class Dot2F32NoModsPat <SDPatternOperator node, Instruction inst, ValueType ty> + : GCNPat < + (f32 (node (ty (VOP3PNoModsDOT ty:$src0)), (ty (VOP3PNoModsDOT ty:$src1)), + (f32 (VOP3PNoModsF32 f32:$src2)), (i1 DSTCLAMP.NONE))), + (f32 (inst $src0, $src1, $src2)) +>; + let AddedComplexity = 30 in { - def : GCNPat< - (f32 (AMDGPUfdot2 v2f16:$src0, v2f16:$src1, f32:$src2, (i1 DSTCLAMP.NONE))), - (f32 (V_DOT2C_F32_F16_e32 $src0, $src1, $src2)) - > { - let SubtargetPredicate = HasDot5Insts; - } - def : GCNPat< - (f32 (int_amdgcn_fdot2_f32_bf16 v2bf16:$src0, v2bf16:$src1, f32:$src2, (i1 DSTCLAMP.NONE))), - (f32 (V_DOT2C_F32_BF16_e32 $src0, $src1, $src2)) - > { - let SubtargetPredicate = HasDot13Insts; - } + let SubtargetPredicate = HasDot5Insts in + def : Dot2F32NoModsPat<AMDGPUfdot2, V_DOT2C_F32_F16_e32, v2f16>; + + let SubtargetPredicate = HasDot13Insts in + def : Dot2F32NoModsPat<int_amdgcn_fdot2_f32_bf16, V_DOT2C_F32_BF16_e32, + v2bf16>; + def : GCNPat< (i32 (int_amdgcn_sdot4 i32:$src0, i32:$src1, i32:$src2, (i1 DSTCLAMP.NONE))), (i32 (V_DOT4C_I32_I8_e32 $src0, $src1, $src2)) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll index 
0d93cfe52af54a..388accbbbadc6f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll @@ -31,63 +31,27 @@ define float @v_fdot2_clamp(<2 x half> %a, <2 x half> %b, float %c) { } define float @v_fdot2_neg_a(<2 x half> %a, <2 x half> %b, float %c) { -; GFX906-LABEL: v_fdot2_neg_a: -; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] -; -; GFX10-LABEL: v_fdot2_neg_a: -; GFX10: ; %bb.0: -; GFX10: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_mov_b32_e32 v0, v2 -; -; GFX11-LABEL: v_fdot2_neg_a: -; GFX11: ; %bb.0: -; GFX11: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX11: v_dot2acc_f32_f16 v2, v0, v1 -; GFX11: v_mov_b32_e32 v0, v2 +; GCN-LABEL: v_fdot2_neg_a: +; GCN: ; %bb.0: +; GCN: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] %neg.a = fneg <2 x half> %a %r = call float @llvm.amdgcn.fdot2(<2 x half> %neg.a, <2 x half> %b, float %c, i1 false) ret float %r } define float @v_fdot2_neg_b(<2 x half> %a, <2 x half> %b, float %c) { -; GFX906-LABEL: v_fdot2_neg_b: -; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] -; -; GFX10-LABEL: v_fdot2_neg_b: -; GFX10: ; %bb.0: -; GFX10: v_xor_b32_e32 v1, 0x80008000, v1 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_mov_b32_e32 v0, v2 -; -; GFX11-LABEL: v_fdot2_neg_b: -; GFX11: ; %bb.0: -; GFX11: v_xor_b32_e32 v1, 0x80008000, v1 -; GFX11: v_dot2acc_f32_f16 v2, v0, v1 -; GFX11: v_mov_b32_e32 v0, v2 +; GCN-LABEL: v_fdot2_neg_b: +; GCN: ; %bb.0: +; GCN: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] %neg.b = fneg <2 x half> %b %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %neg.b, float %c, i1 false) ret float %r } define float @v_fdot2_neg_a_neg_b(<2 x half> %a, <2 x half> %b, float %c) { -; GFX906-LABEL: v_fdot2_neg_a_neg_b: -; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v0, v1, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0] 
-; -; GFX10-LABEL: v_fdot2_neg_a_neg_b: -; GFX10: ; %bb.0: -; GFX10: v_mov_b32_e32 v0, v2 -; GFX10: v_xor_b32_e32 v1, 0x80008000, v1 -; GFX10: v_dot2c_f32_f16 v0, v1, v1 -; -; GFX11-LABEL: v_fdot2_neg_a_neg_b: -; GFX11: ; %bb.0: -; GFX11: v_mov_b32_e32 v0, v2 -; GFX11: v_xor_b32_e32 v1, 0x80008000, v1 -; GFX11: v_dot2acc_f32_f16 v0, v1, v1 +; GCN-LABEL: v_fdot2_neg_a_neg_b: +; GCN: ; %bb.0: +; GCN: v_dot2_f32_f16 v0, v1, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0] %neg.a = fneg <2 x half> %b %neg.b = fneg <2 x half> %b %r = call float @llvm.amdgcn.fdot2(<2 x half> %neg.a, <2 x half> %neg.b, float %c, i1 false) @@ -97,7 +61,8 @@ define float @v_fdot2_neg_a_neg_b(<2 x half> %a, <2 x half> %b, float %c) { define float @v_fdot2_neg_c(<2 x half> %a, <2 x half> %b, float %c) { ; GFX906-LABEL: v_fdot2_neg_c: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] +; GFX906: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 ; ; GFX10-LABEL: v_fdot2_neg_c: ; GFX10: ; %bb.0: @@ -107,9 +72,7 @@ define float @v_fdot2_neg_c(<2 x half> %a, <2 x half> %b, float %c) { ; ; GFX11-LABEL: v_fdot2_neg_c: ; GFX11: ; %bb.0: -; GFX11: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX11: v_dot2acc_f32_f16 v2, v0, v1 -; GFX11: v_mov_b32_e32 v0, v2 +; GFX11: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] %neg.c = fneg float %c %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %neg.c, i1 false) ret float %r diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll index 6cfa02501adc58..e03b57cac6ab22 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll @@ -19,15 +19,9 @@ define float @v_fdot2_f32_bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c) { } define float @v_fdot2_f32_bf16_neg_a(<2 x bfloat> %a, <2 x bfloat> %b, float %c) { -; GFX950-LABEL: v_fdot2_f32_bf16_neg_a: -; GFX950: ; %bb.0: -; GFX950: 
v_xor_b32_e32 v0, 0x80008000, v0 -; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1 -; GFX950: v_mov_b32_e32 v0, v2 -; -; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_a: -; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GCN-LABEL: v_fdot2_f32_bf16_neg_a: +; GCN: ; %bb.0: +; GCN: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] %neg.a = fneg <2 x bfloat> %a %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %neg.a, <2 x bfloat> %b, float %c, i1 false) ret float %r @@ -88,15 +82,9 @@ define float @v_fdot2_f32_bf16_neg_a_hi(<2 x bfloat> %a, <2 x bfloat> %b, float } define float @v_fdot2_f32_bf16_neg_b(<2 x bfloat> %a, <2 x bfloat> %b, float %c) { -; GFX950-LABEL: v_fdot2_f32_bf16_neg_b: -; GFX950: ; %bb.0: -; GFX950: v_xor_b32_e32 v1, 0x80008000, v1 -; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1 -; GFX950: v_mov_b32_e32 v0, v2 -; -; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_b: -; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GCN-LABEL: v_fdot2_f32_bf16_neg_b: +; GCN: ; %bb.0: +; GCN: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] %neg.b = fneg <2 x bfloat> %b %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %neg.b, float %c, i1 false) ret float %r @@ -157,30 +145,18 @@ define float @v_fdot2_f32_bf16_neg_b_hi(<2 x bfloat> %a, <2 x bfloat> %b, float } define float @v_fdot2_f32_bf16_neg_c(<2 x bfloat> %a, <2 x bfloat> %b, float %c) { -; GFX950-LABEL: v_fdot2_f32_bf16_neg_c: -; GFX950: ; %bb.0: -; GFX950: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1 -; GFX950: v_mov_b32_e32 v0, v2 -; -; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_c: -; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,0,1] +; GCN-LABEL: v_fdot2_f32_bf16_neg_c: +; GCN: ; %bb.0: +; GCN: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,0,1] %neg.c = fneg float %c %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x 
bfloat> %b, float %neg.c, i1 false) ret float %r } define float @v_fdot2_f32_bf16_abs_c(<2 x bfloat> %a, <2 x bfloat> %b, float %c) { -; GFX950-LABEL: v_fdot2_f32_bf16_abs_c: -; GFX950: ; %bb.0: -; GFX950: v_and_b32_e32 v2, 0x7fffffff, v2 -; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1 -; GFX950: v_mov_b32_e32 v0, v2 -; -; GFX11PLUS-LABEL: v_fdot2_f32_bf16_abs_c: -; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_hi:[0,0,1] +; GCN-LABEL: v_fdot2_f32_bf16_abs_c: +; GCN: ; %bb.0: +; GCN: v_dot2_f32_bf16 v0, v0, v1, v2 neg_hi:[0,0,1] %abs.c = call float @llvm.fabs.f32(float %c) %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %abs.c, i1 false) ret float %r @@ -497,10 +473,9 @@ define float @v_fdot2_f32_bf16_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, define float @v_fdot2_f32_bf16_neg_a_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) { ; GFX950-LABEL: v_fdot2_f32_bf16_neg_a_dual: ; GFX950: ; %bb.0: -; GFX950: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1 +; GFX950: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4 -; GFX950: v_add_f32_e32 v0, v2, v5 +; GFX950: v_add_f32_e32 v0, v0, v5 ; ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_a_dual: ; GFX11PLUS: ; %bb.0: @@ -585,10 +560,9 @@ define float @v_fdot2_f32_bf16_neg_a_hi_dual(<2 x bfloat> %a, <2 x bfloat> %b, f define float @v_fdot2_f32_bf16_neg_b_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) { ; GFX950-LABEL: v_fdot2_f32_bf16_neg_b_dual: ; GFX950: ; %bb.0: -; GFX950: v_xor_b32_e32 v1, 0x80008000, v1 -; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1 +; GFX950: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] ; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4 -; GFX950: v_add_f32_e32 v0, v2, v5 +; GFX950: v_add_f32_e32 v0, v0, v5 ; ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_b_dual: ; GFX11PLUS: ; %bb.0: 
@@ -673,10 +647,9 @@ define float @v_fdot2_f32_bf16_neg_b_hi_dual(<2 x bfloat> %a, <2 x bfloat> %b, f define float @v_fdot2_f32_bf16_neg_c_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) { ; GFX950-LABEL: v_fdot2_f32_bf16_neg_c_dual: ; GFX950: ; %bb.0: -; GFX950: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1 +; GFX950: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,0,1] ; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4 -; GFX950: v_add_f32_e32 v0, v2, v5 +; GFX950: v_add_f32_e32 v0, v0, v5 ; ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_c_dual: ; GFX11PLUS: ; %bb.0: @@ -693,10 +666,9 @@ define float @v_fdot2_f32_bf16_neg_c_dual(<2 x bfloat> %a, <2 x bfloat> %b, floa define float @v_fdot2_f32_bf16_abs_c_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) { ; GFX950-LABEL: v_fdot2_f32_bf16_abs_c_dual: ; GFX950: ; %bb.0: -; GFX950: v_and_b32_e32 v2, 0x7fffffff, v2 -; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1 +; GFX950: v_dot2_f32_bf16 v0, v0, v1, v2 neg_hi:[0,0,1] ; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4 -; GFX950: v_add_f32_e32 v0, v2, v5 +; GFX950: v_add_f32_e32 v0, v0, v5 ; ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_abs_c_dual: ; GFX11PLUS: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll index c0f1240e4ef058..319d5b3e227601 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll @@ -40,35 +40,9 @@ define float @v_fdot2(<2 x half> %a, <2 x half> %b, float %c) { } define float @v_fdot2_neg_a(<2 x half> %a, <2 x half> %b, float %c) { -; GFX906-LABEL: v_fdot2_neg_a: -; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] -; -; GFX950-LABEL: v_fdot2_neg_a: -; GFX950: ; %bb.0: -; GFX950: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1 -; GFX950: v_mov_b32_e32 v0, v2 -; -; GFX10-LABEL: v_fdot2_neg_a: -; 
GFX10: ; %bb.0: -; GFX10: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_mov_b32_e32 v0, v2 -; -; GFX11-LABEL: v_fdot2_neg_a: -; GFX11: ; %bb.0: -; GFX11: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX11: v_dot2acc_f32_f16 v2, v0, v1 -; GFX11: v_mov_b32_e32 v0, v2 -; -; GFX1170-LABEL: v_fdot2_neg_a: -; GFX1170: ; %bb.0: -; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] -; -; GFX12-LABEL: v_fdot2_neg_a: -; GFX12: ; %bb.0: -; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GCN-LABEL: v_fdot2_neg_a: +; GCN: ; %bb.0: +; GCN: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] %neg.a = fneg <2 x half> %a %r = call float @llvm.amdgcn.fdot2(<2 x half> %neg.a, <2 x half> %b, float %c, i1 false) ret float %r @@ -89,10 +63,7 @@ define float @v_fdot2_neg_a_lo(<2 x half> %a, <2 x half> %b, float %c) { ; ; GFX10-LABEL: v_fdot2_neg_a_lo: ; GFX10: ; %bb.0: -; GFX10: v_xor_b32_e32 v3, 0x8000, v0 -; GFX10: v_bfi_b32 v0, 0xffff, v3, v0 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_mov_b32_e32 v0, v2 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] ; ; GFX11-LABEL: v_fdot2_neg_a_lo: ; GFX11: ; %bb.0: @@ -133,11 +104,7 @@ define float @v_fdot2_neg_a_hi(<2 x half> %a, <2 x half> %b, float %c) { ; ; GFX10-LABEL: v_fdot2_neg_a_hi: ; GFX10: ; %bb.0: -; GFX10: v_mov_b32_e32 v3, 0x8000 -; GFX10: v_xor_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10: v_perm_b32 v0, v3, v0, 0x5040100 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_mov_b32_e32 v0, v2 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[1,0,0] ; ; GFX11-LABEL: v_fdot2_neg_a_hi: ; GFX11: ; %bb.0: @@ -164,35 +131,9 @@ define float @v_fdot2_neg_a_hi(<2 x half> %a, <2 x half> %b, float %c) { } define float @v_fdot2_neg_b(<2 x half> %a, <2 x half> %b, float %c) { -; GFX906-LABEL: v_fdot2_neg_b: -; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] -; -; 
GFX950-LABEL: v_fdot2_neg_b: -; GFX950: ; %bb.0: -; GFX950: v_xor_b32_e32 v1, 0x80008000, v1 -; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1 -; GFX950: v_mov_b32_e32 v0, v2 -; -; GFX10-LABEL: v_fdot2_neg_b: -; GFX10: ; %bb.0: -; GFX10: v_xor_b32_e32 v1, 0x80008000, v1 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_mov_b32_e32 v0, v2 -; -; GFX11-LABEL: v_fdot2_neg_b: -; GFX11: ; %bb.0: -; GFX11: v_xor_b32_e32 v1, 0x80008000, v1 -; GFX11: v_dot2acc_f32_f16 v2, v0, v1 -; GFX11: v_mov_b32_e32 v0, v2 -; -; GFX1170-LABEL: v_fdot2_neg_b: -; GFX1170: ; %bb.0: -; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] -; -; GFX12-LABEL: v_fdot2_neg_b: -; GFX12: ; %bb.0: -; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GCN-LABEL: v_fdot2_neg_b: +; GCN: ; %bb.0: +; GCN: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] %neg.b = fneg <2 x half> %b %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %neg.b, float %c, i1 false) ret float %r @@ -213,10 +154,7 @@ define float @v_fdot2_neg_b_lo(<2 x half> %a, <2 x half> %b, float %c) { ; ; GFX10-LABEL: v_fdot2_neg_b_lo: ; GFX10: ; %bb.0: -; GFX10: v_xor_b32_e32 v3, 0x8000, v1 -; GFX10: v_bfi_b32 v1, 0xffff, v3, v1 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_mov_b32_e32 v0, v2 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] ; ; GFX11-LABEL: v_fdot2_neg_b_lo: ; GFX11: ; %bb.0: @@ -257,11 +195,7 @@ define float @v_fdot2_neg_b_hi(<2 x half> %a, <2 x half> %b, float %c) { ; ; GFX10-LABEL: v_fdot2_neg_b_hi: ; GFX10: ; %bb.0: -; GFX10: v_mov_b32_e32 v3, 0x8000 -; GFX10: v_xor_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10: v_perm_b32 v1, v3, v1, 0x5040100 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_mov_b32_e32 v0, v2 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,1,0] ; ; GFX11-LABEL: v_fdot2_neg_b_hi: ; GFX11: ; %bb.0: @@ -288,70 +222,18 @@ define float @v_fdot2_neg_b_hi(<2 x half> %a, <2 x half> %b, float %c) { } 
define float @v_fdot2_neg_c(<2 x half> %a, <2 x half> %b, float %c) { -; GFX906-LABEL: v_fdot2_neg_c: -; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] -; -; GFX950-LABEL: v_fdot2_neg_c: -; GFX950: ; %bb.0: -; GFX950: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1 -; GFX950: v_mov_b32_e32 v0, v2 -; -; GFX10-LABEL: v_fdot2_neg_c: -; GFX10: ; %bb.0: -; GFX10: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_mov_b32_e32 v0, v2 -; -; GFX11-LABEL: v_fdot2_neg_c: -; GFX11: ; %bb.0: -; GFX11: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX11: v_dot2acc_f32_f16 v2, v0, v1 -; GFX11: v_mov_b32_e32 v0, v2 -; -; GFX1170-LABEL: v_fdot2_neg_c: -; GFX1170: ; %bb.0: -; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] -; -; GFX12-LABEL: v_fdot2_neg_c: -; GFX12: ; %bb.0: -; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] +; GCN-LABEL: v_fdot2_neg_c: +; GCN: ; %bb.0: +; GCN: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] %neg.c = fneg float %c %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %neg.c, i1 false) ret float %r } define float @v_fdot2_abs_c(<2 x half> %a, <2 x half> %b, float %c) { -; GFX906-LABEL: v_fdot2_abs_c: -; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1] -; -; GFX950-LABEL: v_fdot2_abs_c: -; GFX950: ; %bb.0: -; GFX950: v_and_b32_e32 v2, 0x7fffffff, v2 -; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1 -; GFX950: v_mov_b32_e32 v0, v2 -; -; GFX10-LABEL: v_fdot2_abs_c: -; GFX10: ; %bb.0: -; GFX10: v_and_b32_e32 v2, 0x7fffffff, v2 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_mov_b32_e32 v0, v2 -; -; GFX11-LABEL: v_fdot2_abs_c: -; GFX11: ; %bb.0: -; GFX11: v_and_b32_e32 v2, 0x7fffffff, v2 -; GFX11: v_dot2acc_f32_f16 v2, v0, v1 -; GFX11: v_mov_b32_e32 v0, v2 -; -; GFX1170-LABEL: v_fdot2_abs_c: -; GFX1170: ; %bb.0: -; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1] -; -; GFX12-LABEL: v_fdot2_abs_c: -; GFX12: ; %bb.0: -; GFX12: 
v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1] +; GCN-LABEL: v_fdot2_abs_c: +; GCN: ; %bb.0: +; GCN: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1] %abs.c = call float @llvm.fabs.f32(float %c) %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %abs.c, i1 false) ret float %r @@ -371,9 +253,7 @@ define float @v_fdot2_opsel_lo_a(<2 x half> %a, <2 x half> %b, float %c) { ; ; GFX10-LABEL: v_fdot2_opsel_lo_a: ; GFX10: ; %bb.0: -; GFX10: v_perm_b32 v0, v0, v0, 0x7060302 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_mov_b32_e32 v0, v2 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[1,0,0] ; ; GFX11-LABEL: v_fdot2_opsel_lo_a: ; GFX11: ; %bb.0: @@ -409,9 +289,7 @@ define float @v_fdot2_opsel_hi_a(<2 x half> %a, <2 x half> %b, float %c) { ; ; GFX10-LABEL: v_fdot2_opsel_hi_a: ; GFX10: ; %bb.0: -; GFX10: v_perm_b32 v0, v0, v0, 0x5040100 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_mov_b32_e32 v0, v2 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[0,1,1] ; ; GFX11-LABEL: v_fdot2_opsel_hi_a: ; GFX11: ; %bb.0: @@ -447,9 +325,7 @@ define float @v_fdot2_opsel_lo_b(<2 x half> %a, <2 x half> %b, float %c) { ; ; GFX10-LABEL: v_fdot2_opsel_lo_b: ; GFX10: ; %bb.0: -; GFX10: v_perm_b32 v1, v1, v1, 0x7060302 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_mov_b32_e32 v0, v2 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[0,1,0] ; ; GFX11-LABEL: v_fdot2_opsel_lo_b: ; GFX11: ; %bb.0: @@ -485,9 +361,7 @@ define float @v_fdot2_opsel_hi_b(<2 x half> %a, <2 x half> %b, float %c) { ; ; GFX10-LABEL: v_fdot2_opsel_hi_b: ; GFX10: ; %bb.0: -; GFX10: v_perm_b32 v1, v1, v1, 0x5040100 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_mov_b32_e32 v0, v2 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[1,0,1] ; ; GFX11-LABEL: v_fdot2_opsel_hi_b: ; GFX11: ; %bb.0: @@ -885,23 +759,21 @@ define float @v_fdot2_neg_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha ; ; GFX950-LABEL: v_fdot2_neg_a_dual: ; GFX950: ; %bb.0: -; GFX950: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX950: 
v_dot2c_f32_f16_e32 v2, v0, v1 +; GFX950: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX950: v_dot2c_f32_f16_e32 v5, v3, v4 -; GFX950: v_add_f32_e32 v0, v2, v5 +; GFX950: v_add_f32_e32 v0, v0, v5 ; ; GFX10-LABEL: v_fdot2_neg_a_dual: ; GFX10: ; %bb.0: -; GFX10: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; ; GFX11-LABEL: v_fdot2_neg_a_dual: ; GFX11: ; %bb.0: -; GFX11: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 -; GFX11: v_add_f32_e32 v0, v2, v5 +; GFX11: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX11: v_dot2acc_f32_f16 v5, v3, v4 +; GFX11: v_add_f32_e32 v0, v0, v5 ; ; GFX1170-LABEL: v_fdot2_neg_a_dual: ; GFX1170: ; %bb.0: @@ -939,11 +811,9 @@ define float @v_fdot2_neg_a_lo_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ; ; GFX10-LABEL: v_fdot2_neg_a_lo_dual: ; GFX10: ; %bb.0: -; GFX10: v_xor_b32_e32 v6, 0x8000, v0 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_bfi_b32 v0, 0xffff, v6, v0 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; ; GFX11-LABEL: v_fdot2_neg_a_lo_dual: ; GFX11: ; %bb.0: @@ -993,12 +863,9 @@ define float @v_fdot2_neg_a_hi_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ; ; GFX10-LABEL: v_fdot2_neg_a_hi_dual: ; GFX10: ; %bb.0: -; GFX10: v_mov_b32_e32 v6, 0x8000 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[1,0,0] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_xor_b32_sdwa v6, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10: v_perm_b32 v0, v6, v0, 0x5040100 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; ; GFX11-LABEL: 
v_fdot2_neg_a_hi_dual: ; GFX11: ; %bb.0: @@ -1039,23 +906,21 @@ define float @v_fdot2_neg_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha ; ; GFX950-LABEL: v_fdot2_neg_b_dual: ; GFX950: ; %bb.0: -; GFX950: v_xor_b32_e32 v1, 0x80008000, v1 -; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1 +; GFX950: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] ; GFX950: v_dot2c_f32_f16_e32 v5, v3, v4 -; GFX950: v_add_f32_e32 v0, v2, v5 +; GFX950: v_add_f32_e32 v0, v0, v5 ; ; GFX10-LABEL: v_fdot2_neg_b_dual: ; GFX10: ; %bb.0: -; GFX10: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; ; GFX11-LABEL: v_fdot2_neg_b_dual: ; GFX11: ; %bb.0: -; GFX11: v_xor_b32_e32 v1, 0x80008000, v1 -; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 -; GFX11: v_add_f32_e32 v0, v2, v5 +; GFX11: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX11: v_dot2acc_f32_f16 v5, v3, v4 +; GFX11: v_add_f32_e32 v0, v0, v5 ; ; GFX1170-LABEL: v_fdot2_neg_b_dual: ; GFX1170: ; %bb.0: @@ -1093,11 +958,9 @@ define float @v_fdot2_neg_b_lo_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ; ; GFX10-LABEL: v_fdot2_neg_b_lo_dual: ; GFX10: ; %bb.0: -; GFX10: v_xor_b32_e32 v6, 0x8000, v1 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_bfi_b32 v1, 0xffff, v6, v1 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; ; GFX11-LABEL: v_fdot2_neg_b_lo_dual: ; GFX11: ; %bb.0: @@ -1147,12 +1010,9 @@ define float @v_fdot2_neg_b_hi_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ; ; GFX10-LABEL: v_fdot2_neg_b_hi_dual: ; GFX10: ; %bb.0: -; GFX10: v_mov_b32_e32 v6, 0x8000 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,1,0] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: 
v_xor_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10: v_perm_b32 v1, v6, v1, 0x5040100 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; ; GFX11-LABEL: v_fdot2_neg_b_hi_dual: ; GFX11: ; %bb.0: @@ -1193,23 +1053,21 @@ define float @v_fdot2_neg_c_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha ; ; GFX950-LABEL: v_fdot2_neg_c_dual: ; GFX950: ; %bb.0: -; GFX950: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1 +; GFX950: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] ; GFX950: v_dot2c_f32_f16_e32 v5, v3, v4 -; GFX950: v_add_f32_e32 v0, v2, v5 +; GFX950: v_add_f32_e32 v0, v0, v5 ; ; GFX10-LABEL: v_fdot2_neg_c_dual: ; GFX10: ; %bb.0: -; GFX10: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; ; GFX11-LABEL: v_fdot2_neg_c_dual: ; GFX11: ; %bb.0: -; GFX11: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 -; GFX11: v_add_f32_e32 v0, v2, v5 +; GFX11: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] +; GFX11: v_dot2acc_f32_f16 v5, v3, v4 +; GFX11: v_add_f32_e32 v0, v0, v5 ; ; GFX1170-LABEL: v_fdot2_neg_c_dual: ; GFX1170: ; %bb.0: @@ -1238,23 +1096,21 @@ define float @v_fdot2_abs_c_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha ; ; GFX950-LABEL: v_fdot2_abs_c_dual: ; GFX950: ; %bb.0: -; GFX950: v_and_b32_e32 v2, 0x7fffffff, v2 -; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1 +; GFX950: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1] ; GFX950: v_dot2c_f32_f16_e32 v5, v3, v4 -; GFX950: v_add_f32_e32 v0, v2, v5 +; GFX950: v_add_f32_e32 v0, v0, v5 ; ; GFX10-LABEL: v_fdot2_abs_c_dual: ; GFX10: ; %bb.0: -; GFX10: v_and_b32_e32 v2, 0x7fffffff, v2 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1] ; GFX10: 
v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; ; GFX11-LABEL: v_fdot2_abs_c_dual: ; GFX11: ; %bb.0: -; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_and_b32 v2, 0x7fffffff, v2 -; GFX11: v_dot2acc_f32_f16 v2, v0, v1 -; GFX11: v_add_f32_e32 v0, v2, v5 +; GFX11: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1] +; GFX11: v_dot2acc_f32_f16 v5, v3, v4 +; GFX11: v_add_f32_e32 v0, v0, v5 ; ; GFX1170-LABEL: v_fdot2_abs_c_dual: ; GFX1170: ; %bb.0: @@ -1291,10 +1147,9 @@ define float @v_fdot2_opsel_lo_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2 ; ; GFX10-LABEL: v_fdot2_opsel_lo_a_dual: ; GFX10: ; %bb.0: -; GFX10: v_perm_b32 v0, v0, v0, 0x7060302 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[1,0,0] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; ; GFX11-LABEL: v_fdot2_opsel_lo_a_dual: ; GFX11: ; %bb.0: @@ -1339,10 +1194,9 @@ define float @v_fdot2_opsel_hi_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2 ; ; GFX10-LABEL: v_fdot2_opsel_hi_a_dual: ; GFX10: ; %bb.0: -; GFX10: v_perm_b32 v0, v0, v0, 0x5040100 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[0,1,1] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; ; GFX11-LABEL: v_fdot2_opsel_hi_a_dual: ; GFX11: ; %bb.0: @@ -1387,10 +1241,9 @@ define float @v_fdot2_opsel_lo_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2 ; ; GFX10-LABEL: v_fdot2_opsel_lo_b_dual: ; GFX10: ; %bb.0: -; GFX10: v_perm_b32 v1, v1, v1, 0x7060302 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[0,1,0] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; ; GFX11-LABEL: v_fdot2_opsel_lo_b_dual: ; GFX11: ; %bb.0: @@ -1435,10 +1288,9 @@ define float @v_fdot2_opsel_hi_b_dual(<2 x 
half> %a, <2 x half> %b, float %c, <2 ; ; GFX10-LABEL: v_fdot2_opsel_hi_b_dual: ; GFX10: ; %bb.0: -; GFX10: v_perm_b32 v1, v1, v1, 0x5040100 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[1,0,1] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; ; GFX11-LABEL: v_fdot2_opsel_hi_b_dual: ; GFX11: ; %bb.0: _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
