https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/196516
>From b24bc4da58fa8b9819f28c18013a3bfd9e86746b Mon Sep 17 00:00:00 2001 From: Petar Avramovic <[email protected]> Date: Fri, 8 May 2026 11:07:09 +0200 Subject: [PATCH] AMDGPU: Reland: Codegen for v_dual_dot2acc_f32_f16/bf16 from VOP3 For V_DOT2_F32_F16 and V_DOT2_F32_BF16 add their VOPDName and mark them with usesCustomInserter which will be used to add pre-RA register allocation hints to preferably assign dst and src2 to the same physical register. When the hint is satisfied, canMapVOP3PToVOPD recognises the instruction as eligible for VOPD pairing by checking if it is VOP2 like: dst==src2, no source modifiers, no clamp, and src1 is a register. Mark both instructions as commutable to allow a literal in src1 to be moved to src0, since VOPD only permits a literal in src0. Original patch had a bug where it did not check if physical src registers match register class of appropriate operand in fullVOPD instructions, check is now done via isValidVOPDSrc. --- llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp | 35 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 8 + .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 6 + llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 13 +- llvm/lib/Target/AMDGPU/VOPInstructions.td | 4 +- .../AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll | 232 +++-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll | 972 ++++++++---------- 7 files changed, 668 insertions(+), 602 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp index 8297ba7e3425c..e6c9fb6d168d1 100644 --- a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp +++ b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp @@ -56,6 +56,38 @@ bool isValidVOPDSrc(const SIInstrInfo &TII, int VOPDOpc, unsigned CompIdx, return TII.getRegClass(TII.get(VOPDOpc), OpIdx)->contains(PhysSrcReg); } +static const MachineOperand &getNamedOp(const MachineInstr &MI, + AMDGPU::OpName Name) { + return MI.getOperand(getNamedOperandIdx(MI.getOpcode(), Name)); +} + +// Check if MI is a VOP3P instruction with 
operands that satisfy the constraints +// for mapping it to a VOP2/VOPD opcode: no modifiers, no clamp, src1 and src2 +// are registers (src0 can be register or literal), and src2 is same as dst. +static bool canMapVOP3PToVOPD(const MachineInstr &MI) { + unsigned Opc = MI.getOpcode(); + if (Opc != AMDGPU::V_DOT2_F32_F16 && Opc != AMDGPU::V_DOT2_F32_BF16) + return false; + // src0 can be register or literal + if (getNamedOp(MI, AMDGPU::OpName::src0_modifiers).getImm() != + SISrcMods::OP_SEL_1) + return false; + if (getNamedOp(MI, AMDGPU::OpName::src1_modifiers).getImm() != + SISrcMods::OP_SEL_1) + return false; + if (!getNamedOp(MI, AMDGPU::OpName::src1).isReg()) + return false; + if (getNamedOp(MI, AMDGPU::OpName::src2_modifiers).getImm() != + SISrcMods::OP_SEL_1) + return false; + if (!getNamedOp(MI, AMDGPU::OpName::src2).isReg()) + return false; + if (getNamedOp(MI, AMDGPU::OpName::clamp).getImm() != 0) + return false; + return getNamedOp(MI, AMDGPU::OpName::vdst).getReg() == + getNamedOp(MI, AMDGPU::OpName::src2).getReg(); +} + bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII, const MachineInstr &MIX, const MachineInstr &MIY, bool IsVOPD3, @@ -67,7 +99,8 @@ bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII, if (IsVOPD3 && !ST.hasVOPD3()) return false; - if (!IsVOPD3 && (TII.isVOP3(MIX) || TII.isVOP3(MIY))) + if (!IsVOPD3 && ((TII.isVOP3(MIX) && !canMapVOP3PToVOPD(MIX)) || + (TII.isVOP3(MIY) && !canMapVOP3PToVOPD(MIY)))) return false; if (TII.isDPP(MIX) || TII.isDPP(MIY)) return false; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 2817f1ac8c3ad..c87a9dfb0603a 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -7319,6 +7319,14 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.getOperand(0).setReg(OriginalExec); return BB; } + case AMDGPU::V_DOT2_F32_F16: + case AMDGPU::V_DOT2_F32_BF16: { + // Hint RA to assign dst 
and src2 the same physical register. + // For targets that have a VOPD but no VOP2 variant of this instruction, + // this is one of the conditions to attempt converting VOP3P to VOPD. + MRI.setSimpleHint(MI.getOperand(0).getReg(), MI.getOperand(6).getReg()); + return BB; + } default: if (TII->isImage(MI) || TII->isMUBUF(MI)) { if (!MI.mayStore()) diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 7c283be411b20..1b2c7f368fd06 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -947,6 +947,12 @@ ComponentProps::ComponentProps(const MCInstrDesc &OpDesc, bool VOP3Layout) { NumVOPD3Mods = 2; if (IsVOP3) SrcOperandsNum = 3; + } else if (Opcode == AMDGPU::V_DOT2_F32_F16 || + Opcode == AMDGPU::V_DOT2_F32_BF16) { + // VOP3P opcodes that have VOPD but don't have VOP2 version. Using VOPD3 + // path in getIndexOfSrcInMCOperands to get correct src operand indexes, + // but generating VOPD, not VOPD3. 
+ NumVOPD3Mods = SrcOperandsNum; } else if (isSISrcFPOperand(OpDesc, getNamedOperandIdx(Opcode, OpName::src0))) { // All FP VOPD instructions have Neg modifiers for all operands except diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 79250421f5df7..2d7046afc5279 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -87,11 +87,13 @@ multiclass VOP3PInst<string OpName, VOPProfile P, } multiclass VOP3PInstDotWithDual<string OpName, VOPProfile P, - SDPatternOperator node = null_frag> { + SDPatternOperator node = null_frag, + bits<6> VOPDOp, string VOPDName> { def NAME : VOP3P_Pseudo<OpName, P, getVOP3PModPat<P, node, 1 /*HasExplicitClamp*/, 1/*IsDOT*/, - VOP3PModsDOT, VOP3PModsF32>.ret>; + VOP3PModsDOT, VOP3PModsF32>.ret>, + VOPD_Component<VOPDOp, VOPDName>; let SubtargetPredicate = isGFX11Plus in { if P.HasExtVOP3DPP then def _dpp : VOP3_DPP_Pseudo<OpName, P> { @@ -692,12 +694,12 @@ defm V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3P_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2, 1>; } // End OtherPredicates = [HasDot2Insts] -let OtherPredicates = [HasDot10Insts] in +let OtherPredicates = [HasDot10Insts], isCommutable = 1, usesCustomInserter = 1 in defm V_DOT2_F32_F16 : VOP3PInstDotWithDual<"v_dot2_f32_f16", VOP3P_Profile<VOP_F32_V2F16_V2F16_F32, VOP3_REGULAR, /*HasDPP*/ 1>, - AMDGPUfdot2>; + AMDGPUfdot2, 0xC, "v_dot2acc_f32_f16">; let OtherPredicates = [HasDot7Insts] in { defm V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", @@ -721,9 +723,10 @@ def DOT2_BF16_Profile let SubtargetPredicate = HasDot12Insts in { +let isCommutable = 1, usesCustomInserter = 1 in defm V_DOT2_F32_BF16 : VOP3PInstDotWithDual<"v_dot2_f32_bf16", DOT2_BF16_Profile, - int_amdgcn_fdot2_f32_bf16>; + int_amdgcn_fdot2_f32_bf16, 0xD, "v_dot2acc_f32_bf16">; } // End SubtargetPredicate = HasDot12Insts diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td 
b/llvm/lib/Target/AMDGPU/VOPInstructions.td index c2543e75dea6a..a379785616c6c 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -34,8 +34,8 @@ class VOP <string opName> { string OpName = opName; } -// First 13 insts from VOPDY are also VOPDX. DOT2ACC_F32_BF16 is omitted -defvar VOPDX_Max_Index = 12; +// First 14 insts from VOPDY are also VOPDX. +defvar VOPDX_Max_Index = 13; defvar VOPD3X_Max_Index = 36; class VOPD_Component<bits<6> OpIn, string vOPDName> { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll index 1dff54ac35427..067e3e6c2ca67 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll @@ -225,7 +225,7 @@ define float @v_fdot2_f32_bf16_inline_literal_b(<2 x bfloat> %a, float %c) { ; ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b: ; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dot2_f32_bf16 v0, v0, 0x40004000, v1 +; GFX11PLUS: v_dot2_f32_bf16 v0, 0x40004000, v0, v1 %ret = tail call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> <bfloat 2.0, bfloat 2.0>, float %c, i1 false) ret float %ret } @@ -373,7 +373,7 @@ define float @v_fdot2_f32_bf16_inline_literal_b_clamp(<2 x bfloat> %a, float %c) ; ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b_clamp: ; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dot2_f32_bf16 v0, v0, 0x40004000, v1 clamp +; GFX11PLUS: v_dot2_f32_bf16 v0, 0x40004000, v0, v1 clamp %ret = tail call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> <bfloat 2.0, bfloat 2.0>, float %c, i1 true) ret float %ret } @@ -395,9 +395,116 @@ define float @v_fdot2_f32_bf16_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, ; ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_dual: ; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 -; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5 -; GFX11PLUS: v_add_f32_e32 v0, v0, v1 +; GFX11PLUS: 
v_dual_dot2acc_f32_bf16 v2, v0, v1 :: v_dual_dot2acc_f32_bf16 v5, v3, v4 +; GFX11PLUS: v_add_f32_e32 v0, v2, v5 + %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 false) + %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false) + %r = fadd float %r0, %r1 + ret float %r +} + +define float @v_fdot2_f32_bf16_dual_sgpr_src0_x(<2 x bfloat> inreg %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) { +; GFX950-LABEL: v_fdot2_f32_bf16_dual_sgpr_src0_x: +; GFX950: ; %bb.0: +; GFX950: v_dot2c_f32_bf16_e32 v1, s0, v0 +; GFX950: v_dot2c_f32_bf16_e32 v4, v2, v3 +; GFX950: v_add_f32_e32 v0, v1, v4 +; +; GFX11PLUS-LABEL: v_fdot2_f32_bf16_dual_sgpr_src0_x: +; GFX11PLUS: ; %bb.0: +; GFX11PLUS: v_dual_dot2acc_f32_bf16 v1, s0, v0 :: v_dual_dot2acc_f32_bf16 v4, v2, v3 +; GFX11PLUS: v_add_f32_e32 v0, v1, v4 + %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 false) + %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false) + %r = fadd float %r0, %r1 + ret float %r +} + +define float @v_fdot2_f32_bf16_dual_sgpr_src1_x(<2 x bfloat> %a, <2 x bfloat> inreg %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) { +; GFX950-LABEL: v_fdot2_f32_bf16_dual_sgpr_src1_x: +; GFX950: ; %bb.0: +; GFX950: v_dot2c_f32_bf16_e32 v1, s0, v0 +; GFX950: v_dot2c_f32_bf16_e32 v4, v2, v3 +; GFX950: v_add_f32_e32 v0, v1, v4 +; +; GFX11PLUS-LABEL: v_fdot2_f32_bf16_dual_sgpr_src1_x: +; GFX11PLUS: ; %bb.0: +; GFX11PLUS: v_dot2_f32_bf16 v1, v0, s0, v1 +; GFX11PLUS: v_dot2_f32_bf16 v4, v2, v3, v4 +; GFX11PLUS: v_add_f32_e32 v0, v1, v4 + %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 false) + %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false) + %r = fadd float %r0, %r1 + ret float %r +} + +define float 
@v_fdot2_f32_bf16_dual_sgpr_src2_x(<2 x bfloat> %a, <2 x bfloat> %b, float inreg %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) { +; GFX950-LABEL: v_fdot2_f32_bf16_dual_sgpr_src2_x: +; GFX950: ; %bb.0: +; GFX950: v_mov_b32_e32 v5, s0 +; GFX950: v_dot2c_f32_bf16_e32 v5, v0, v1 +; GFX950: v_dot2c_f32_bf16_e32 v4, v2, v3 +; GFX950: v_add_f32_e32 v0, v5, v4 +; +; GFX11PLUS-LABEL: v_fdot2_f32_bf16_dual_sgpr_src2_x: +; GFX11PLUS: ; %bb.0: +; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, s0 +; GFX11PLUS: v_dot2_f32_bf16 v4, v2, v3, v4 +; GFX11PLUS: v_add_f32_e32 v0, v0, v4 + %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 false) + %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false) + %r = fadd float %r0, %r1 + ret float %r +} + +define float @v_fdot2_f32_bf16_dual_sgpr_src0_y(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> inreg %d, <2 x bfloat> %e, <2 x bfloat> %vopd_dst_pad, float %f) { +; GFX950-LABEL: v_fdot2_f32_bf16_dual_sgpr_src0_y: +; GFX950: ; %bb.0: +; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1 +; GFX950: v_dot2c_f32_bf16_e32 v5, s0, v3 +; GFX950: v_add_f32_e32 v0, v2, v5 +; +; GFX11PLUS-LABEL: v_fdot2_f32_bf16_dual_sgpr_src0_y: +; GFX11PLUS: ; %bb.0: +; GFX11PLUS: v_dual_dot2acc_f32_bf16 v2, v0, v1 :: v_dual_dot2acc_f32_bf16 v5, s0, v3 +; GFX11PLUS: v_add_f32_e32 v0, v2, v5 + %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 false) + %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false) + %r = fadd float %r0, %r1 + ret float %r +} + +define float @v_fdot2_f32_bf16_dual_sgpr_src1_y(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %vopd_dst_pad, <2 x bfloat> inreg %e, float %f) { +; GFX950-LABEL: v_fdot2_f32_bf16_dual_sgpr_src1_y: +; GFX950: ; %bb.0: +; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1 +; GFX950: v_dot2c_f32_bf16_e32 v5, s0, v3 +; GFX950: v_add_f32_e32 v0, 
v2, v5 +; +; GFX11PLUS-LABEL: v_fdot2_f32_bf16_dual_sgpr_src1_y: +; GFX11PLUS: ; %bb.0: +; GFX11PLUS: v_dot2_f32_bf16 v2, v0, v1, v2 +; GFX11PLUS: v_dot2_f32_bf16 v5, v3, s0, v5 +; GFX11PLUS: v_add_f32_e32 v0, v2, v5 + %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 false) + %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false) + %r = fadd float %r0, %r1 + ret float %r +} + +define float @v_fdot2_f32_bf16_dual_sgpr_src2_y(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float inreg %f) { +; GFX950-LABEL: v_fdot2_f32_bf16_dual_sgpr_src2_y: +; GFX950: ; %bb.0: +; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1 +; GFX950: v_mov_b32_e32 v0, s0 +; GFX950: v_dot2c_f32_bf16_e32 v0, v3, v4 +; GFX950: v_add_f32_e32 v0, v2, v0 +; +; GFX11PLUS-LABEL: v_fdot2_f32_bf16_dual_sgpr_src2_y: +; GFX11PLUS: ; %bb.0: +; GFX11PLUS: v_dot2_f32_bf16 v2, v0, v1, v2 +; GFX11PLUS: v_dot2_f32_bf16 v0, v3, v4, s0 +; GFX11PLUS: v_add_f32_e32 v0, v2, v0 %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false) %r = fadd float %r0, %r1 @@ -407,15 +514,15 @@ define float @v_fdot2_f32_bf16_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, define float @v_fdot2_f32_bf16_neg_a_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) { ; GFX950-LABEL: v_fdot2_f32_bf16_neg_a_dual: ; GFX950: ; %bb.0: -; GFX950: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX950: v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4 -; GFX950: v_add_f32_e32 v0, v0, v5 +; GFX950: v_add_f32_e32 v0, v2, v5 ; ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_a_dual: ; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] -; GFX11PLUS: 
v_dot2_f32_bf16 v1, v3, v4, v5 -; GFX11PLUS: v_add_f32_e32 v0, v0, v1 +; GFX11PLUS: v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX11PLUS: v_dot2_f32_bf16 v5, v3, v4, v5 +; GFX11PLUS: v_add_f32_e32 v0, v2, v5 %neg.a = fneg <2 x bfloat> %a %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %neg.a, <2 x bfloat> %b, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false) @@ -436,9 +543,8 @@ define float @v_fdot2_f32_bf16_neg_a_lo_dual(<2 x bfloat> %a, <2 x bfloat> %b, f ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_a_lo_dual: ; GFX11PLUS: ; %bb.0: ; GFX11PLUS: v_xor_b16 v0.l, 0x8000, v0.l -; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 -; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5 -; GFX11PLUS: v_add_f32_e32 v0, v0, v1 +; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1 +; GFX11PLUS: v_add_f32_e32 v0, v2, v5 %a_lo = extractelement <2 x bfloat> %a, i32 0 %neg.a_lo = fneg bfloat %a_lo %neg_lo.a = insertelement <2 x bfloat> %a, bfloat %neg.a_lo, i32 0 @@ -462,9 +568,8 @@ define float @v_fdot2_f32_bf16_neg_a_hi_dual(<2 x bfloat> %a, <2 x bfloat> %b, f ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_a_hi_dual: ; GFX11PLUS: ; %bb.0: ; GFX11PLUS: v_xor_b16 v0.h, 0x8000, v0.h -; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 -; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5 -; GFX11PLUS: v_add_f32_e32 v0, v0, v1 +; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1 +; GFX11PLUS: v_add_f32_e32 v0, v2, v5 %a_hi = extractelement <2 x bfloat> %a, i32 1 %neg.a_hi = fneg bfloat %a_hi %neg_hi.a = insertelement <2 x bfloat> %a, bfloat %neg.a_hi, i32 1 @@ -477,15 +582,15 @@ define float @v_fdot2_f32_bf16_neg_a_hi_dual(<2 x bfloat> %a, <2 x bfloat> %b, f define float @v_fdot2_f32_bf16_neg_b_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) { ; GFX950-LABEL: v_fdot2_f32_bf16_neg_b_dual: ; GFX950: ; %bb.0: -; GFX950: 
v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX950: v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] ; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4 -; GFX950: v_add_f32_e32 v0, v0, v5 +; GFX950: v_add_f32_e32 v0, v2, v5 ; ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_b_dual: ; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] -; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5 -; GFX11PLUS: v_add_f32_e32 v0, v0, v1 +; GFX11PLUS: v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX11PLUS: v_dot2_f32_bf16 v5, v3, v4, v5 +; GFX11PLUS: v_add_f32_e32 v0, v2, v5 %neg.b = fneg <2 x bfloat> %b %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %neg.b, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false) @@ -506,9 +611,8 @@ define float @v_fdot2_f32_bf16_neg_b_lo_dual(<2 x bfloat> %a, <2 x bfloat> %b, f ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_b_lo_dual: ; GFX11PLUS: ; %bb.0: ; GFX11PLUS: v_xor_b16 v1.l, 0x8000, v1.l -; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 -; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5 -; GFX11PLUS: v_add_f32_e32 v0, v0, v1 +; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1 +; GFX11PLUS: v_add_f32_e32 v0, v2, v5 %b_lo = extractelement <2 x bfloat> %b, i32 0 %neg.b_lo = fneg bfloat %b_lo %neg_lo.b = insertelement <2 x bfloat> %b, bfloat %neg.b_lo, i32 0 @@ -532,9 +636,8 @@ define float @v_fdot2_f32_bf16_neg_b_hi_dual(<2 x bfloat> %a, <2 x bfloat> %b, f ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_b_hi_dual: ; GFX11PLUS: ; %bb.0: ; GFX11PLUS: v_xor_b16 v1.h, 0x8000, v1.h -; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 -; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5 -; GFX11PLUS: v_add_f32_e32 v0, v0, v1 +; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1 +; GFX11PLUS: v_add_f32_e32 v0, v2, v5 %b_hi = extractelement <2 x bfloat> %b, i32 1 
%neg.b_hi = fneg bfloat %b_hi %neg_hi.b = insertelement <2 x bfloat> %b, bfloat %neg.b_hi, i32 1 @@ -547,15 +650,15 @@ define float @v_fdot2_f32_bf16_neg_b_hi_dual(<2 x bfloat> %a, <2 x bfloat> %b, f define float @v_fdot2_f32_bf16_neg_c_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) { ; GFX950-LABEL: v_fdot2_f32_bf16_neg_c_dual: ; GFX950: ; %bb.0: -; GFX950: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,0,1] +; GFX950: v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[0,0,1] ; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4 -; GFX950: v_add_f32_e32 v0, v0, v5 +; GFX950: v_add_f32_e32 v0, v2, v5 ; ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_c_dual: ; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,0,1] -; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5 -; GFX11PLUS: v_add_f32_e32 v0, v0, v1 +; GFX11PLUS: v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[0,0,1] +; GFX11PLUS: v_dot2_f32_bf16 v5, v3, v4, v5 +; GFX11PLUS: v_add_f32_e32 v0, v2, v5 %neg.c = fneg float %c %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %neg.c, i1 false) %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false) @@ -566,15 +669,15 @@ define float @v_fdot2_f32_bf16_neg_c_dual(<2 x bfloat> %a, <2 x bfloat> %b, floa define float @v_fdot2_f32_bf16_abs_c_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) { ; GFX950-LABEL: v_fdot2_f32_bf16_abs_c_dual: ; GFX950: ; %bb.0: -; GFX950: v_dot2_f32_bf16 v0, v0, v1, v2 neg_hi:[0,0,1] +; GFX950: v_dot2_f32_bf16 v2, v0, v1, v2 neg_hi:[0,0,1] ; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4 -; GFX950: v_add_f32_e32 v0, v0, v5 +; GFX950: v_add_f32_e32 v0, v2, v5 ; ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_abs_c_dual: ; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_hi:[0,0,1] -; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5 -; GFX11PLUS: v_add_f32_e32 v0, v0, v1 +; GFX11PLUS: v_dot2_f32_bf16 v2, v0, v1, v2 
neg_hi:[0,0,1] +; GFX11PLUS: v_dot2_f32_bf16 v5, v3, v4, v5 +; GFX11PLUS: v_add_f32_e32 v0, v2, v5 %abs.c = call float @llvm.fabs.f32(float %c) %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %abs.c, i1 false) %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false) @@ -594,9 +697,8 @@ define float @v_fdot2_f32_bf16_opsel_lo_a_dual(<2 x bfloat> %a, <2 x bfloat> %b, ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_opsel_lo_a_dual: ; GFX11PLUS: ; %bb.0: ; GFX11PLUS: v_mov_b16_e32 v0.l, v0.h -; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 -; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5 -; GFX11PLUS: v_add_f32_e32 v0, v0, v1 +; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1 +; GFX11PLUS: v_add_f32_e32 v0, v2, v5 %shuf = shufflevector <2 x bfloat> %a, <2 x bfloat> poison, <2 x i32> <i32 1, i32 1> %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %shuf, <2 x bfloat> %b, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false) @@ -616,9 +718,8 @@ define float @v_fdot2_f32_bf16_opsel_hi_a_dual(<2 x bfloat> %a, <2 x bfloat> %b, ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_opsel_hi_a_dual: ; GFX11PLUS: ; %bb.0: ; GFX11PLUS: v_mov_b16_e32 v0.h, v0.l -; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 -; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5 -; GFX11PLUS: v_add_f32_e32 v0, v0, v1 +; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1 +; GFX11PLUS: v_add_f32_e32 v0, v2, v5 %shuf = shufflevector <2 x bfloat> %a, <2 x bfloat> poison, <2 x i32> <i32 0, i32 0> %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %shuf, <2 x bfloat> %b, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false) @@ -638,9 +739,8 @@ define float @v_fdot2_f32_bf16_opsel_lo_b_dual(<2 x bfloat> %a, <2 x bfloat> %b, ; GFX11PLUS-LABEL: 
v_fdot2_f32_bf16_opsel_lo_b_dual: ; GFX11PLUS: ; %bb.0: ; GFX11PLUS: v_mov_b16_e32 v1.l, v1.h -; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 -; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5 -; GFX11PLUS: v_add_f32_e32 v0, v0, v1 +; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1 +; GFX11PLUS: v_add_f32_e32 v0, v2, v5 %shuf = shufflevector <2 x bfloat> %b, <2 x bfloat> poison, <2 x i32> <i32 1, i32 1> %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %shuf, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false) @@ -660,9 +760,8 @@ define float @v_fdot2_f32_bf16_opsel_hi_b_dual(<2 x bfloat> %a, <2 x bfloat> %b, ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_opsel_hi_b_dual: ; GFX11PLUS: ; %bb.0: ; GFX11PLUS: v_mov_b16_e32 v1.h, v1.l -; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 -; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5 -; GFX11PLUS: v_add_f32_e32 v0, v0, v1 +; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1 +; GFX11PLUS: v_add_f32_e32 v0, v2, v5 %shuf = shufflevector <2 x bfloat> %b, <2 x bfloat> poison, <2 x i32> <i32 0, i32 0> %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %shuf, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false) @@ -682,9 +781,8 @@ define float @v_fdot2_f32_bf16_inline_literal_a_y(<2 x bfloat> %a, <2 x bfloat> ; ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_a_y: ; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 -; GFX11PLUS: v_dot2_f32_bf16 v1, 0x40004000, v4, v5 -; GFX11PLUS: v_add_f32_e32 v0, v0, v1 +; GFX11PLUS: v_dual_dot2acc_f32_bf16 v2, v0, v1 :: v_dual_dot2acc_f32_bf16 v5, 0x40004000, v4 +; GFX11PLUS: v_add_f32_e32 v0, v2, v5 %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> 
<bfloat 2.0, bfloat 2.0>, <2 x bfloat> %e, float %f, i1 false) %r = fadd float %r0, %r1 @@ -700,9 +798,8 @@ define float @v_fdot2_f32_bf16_inline_literal_a_xy(<2 x bfloat> %a, <2 x bfloat> ; ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_a_xy: ; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dot2_f32_bf16 v0, 0x40004000, v1, v2 -; GFX11PLUS: v_dot2_f32_bf16 v1, 0x40004000, v4, v5 -; GFX11PLUS: v_add_f32_e32 v0, v0, v1 +; GFX11PLUS: v_dual_dot2acc_f32_bf16 v2, 0x40004000, v1 :: v_dual_dot2acc_f32_bf16 v5, 0x40004000, v4 +; GFX11PLUS: v_add_f32_e32 v0, v2, v5 %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> <bfloat 2.0, bfloat 2.0>, <2 x bfloat> %b, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> <bfloat 2.0, bfloat 2.0>, <2 x bfloat> %e, float %f, i1 false) %r = fadd float %r0, %r1 @@ -718,9 +815,8 @@ define float @v_fdot2_f32_bf16_inline_literal_b_x(<2 x bfloat> %a, <2 x bfloat> ; ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b_x: ; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dot2_f32_bf16 v0, v0, 0x40004000, v2 -; GFX11PLUS: v_dot2_f32_bf16 v1, v4, v3, v5 -; GFX11PLUS: v_add_f32_e32 v0, v0, v1 +; GFX11PLUS: v_dual_dot2acc_f32_bf16 v2, 0x40004000, v0 :: v_dual_dot2acc_f32_bf16 v5, v4, v3 +; GFX11PLUS: v_add_f32_e32 v0, v2, v5 %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> <bfloat 2.0, bfloat 2.0>, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %e, <2 x bfloat> %d, float %f, i1 false) %r = fadd float %r0, %r1 @@ -736,9 +832,8 @@ define float @v_fdot2_f32_bf16_inline_literal_b_y(<2 x bfloat> %a, <2 x bfloat> ; ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b_y: ; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dot2_f32_bf16 v0, v1, v0, v2 -; GFX11PLUS: v_dot2_f32_bf16 v1, v3, 0x40004000, v5 -; GFX11PLUS: v_add_f32_e32 v0, v0, v1 +; GFX11PLUS: v_dual_dot2acc_f32_bf16 v2, v1, v0 :: v_dual_dot2acc_f32_bf16 v5, 0x40004000, v3 +; GFX11PLUS: v_add_f32_e32 v0, v2, v5 %r0 = call float 
@llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %b, <2 x bfloat> %a, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> <bfloat 2.0, bfloat 2.0>, float %f, i1 false) %r = fadd float %r0, %r1 @@ -754,9 +849,8 @@ define float @v_fdot2_f32_bf16_inline_literal_b_xy(<2 x bfloat> %a, <2 x bfloat> ; ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b_xy: ; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dot2_f32_bf16 v0, v0, 0x40004000, v2 -; GFX11PLUS: v_dot2_f32_bf16 v1, v3, 0x40004000, v5 -; GFX11PLUS: v_add_f32_e32 v0, v0, v1 +; GFX11PLUS: v_dual_dot2acc_f32_bf16 v2, 0x40004000, v0 :: v_dual_dot2acc_f32_bf16 v5, 0x40004000, v3 +; GFX11PLUS: v_add_f32_e32 v0, v2, v5 %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> <bfloat 2.0, bfloat 2.0>, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> <bfloat 2.0, bfloat 2.0>, float %f, i1 false) %r = fadd float %r0, %r1 @@ -774,8 +868,8 @@ define float @v_fdot2_f32_bf16_inline_literal_c_dual(<2 x bfloat> %a, <2 x bfloa ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_c_dual: ; GFX11PLUS: ; %bb.0: ; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, 2.0 -; GFX11PLUS: v_dot2_f32_bf16 v1, v2, v3, v4 -; GFX11PLUS: v_add_f32_e32 v0, v0, v1 +; GFX11PLUS: v_dot2_f32_bf16 v4, v2, v3, v4 +; GFX11PLUS: v_add_f32_e32 v0, v0, v4 %r0 = tail call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float 2.0, i1 false) %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false) %r = fadd float %r0, %r1 @@ -785,9 +879,9 @@ define float @v_fdot2_f32_bf16_inline_literal_c_dual(<2 x bfloat> %a, <2 x bfloa define float @v_fdot2_f32_bf16_clamp_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) { ; GCN-LABEL: v_fdot2_f32_bf16_clamp_dual: ; GCN: ; %bb.0: -; GCN: v_dot2_f32_bf16 v0, v0, v1, v2 clamp -; GCN: v_dot2_f32_bf16 v1, v3, v4, v5 clamp -; GCN: v_add_f32_e32 v0, v0, 
v1 +; GCN: v_dot2_f32_bf16 v2, v0, v1, v2 clamp +; GCN: v_dot2_f32_bf16 v5, v3, v4, v5 clamp +; GCN: v_add_f32_e32 v0, v2, v5 %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 true) %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 true) %r = fadd float %r0, %r1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll index a16cc091eb766..007757a643535 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll @@ -3,8 +3,8 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck %s --check-prefixes=GCN,GFX950 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck %s --check-prefixes=GCN,GFX10 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s --check-prefixes=GCN,GFX11PLUS,GFX11 -; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX11PLUS,GFX1170 -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX11PLUS,GFX12 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX11PLUS,GFX1170-GFX12 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck %s --check-prefixes=GCN,GFX11PLUS,GFX1170-GFX12 declare float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 %clamp) @@ -28,13 +28,9 @@ define float @v_fdot2(<2 x half> %a, <2 x half> %b, float %c) { ; GFX11: v_dot2acc_f32_f16 v2, v0, v1 ; GFX11: v_mov_b32_e32 v0, v2 ; -; GFX1170-LABEL: v_fdot2: -; GFX1170: ; %bb.0: -; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 -; -; GFX12-LABEL: v_fdot2: -; GFX12: ; %bb.0: -; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX1170-GFX12-LABEL: v_fdot2: +; GFX1170-GFX12: ; %bb.0: +; GFX1170-GFX12: v_dot2_f32_f16 v0, v0, v1, v2 %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 false) ret float %r } @@ -71,15 +67,10 @@ define float @v_fdot2_neg_a_lo(<2 
x half> %a, <2 x half> %b, float %c) { ; GFX11: v_dot2acc_f32_f16 v2, v0, v1 ; GFX11: v_mov_b32_e32 v0, v2 ; -; GFX1170-LABEL: v_fdot2_neg_a_lo: -; GFX1170: ; %bb.0: -; GFX1170: v_xor_b16 v0.l, 0x8000, v0.l -; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 -; -; GFX12-LABEL: v_fdot2_neg_a_lo: -; GFX12: ; %bb.0: -; GFX12: v_xor_b16 v0.l, 0x8000, v0.l -; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX1170-GFX12-LABEL: v_fdot2_neg_a_lo: +; GFX1170-GFX12: ; %bb.0: +; GFX1170-GFX12: v_xor_b16 v0.l, 0x8000, v0.l +; GFX1170-GFX12: v_dot2_f32_f16 v0, v0, v1, v2 %a_lo = extractelement <2 x half> %a, i32 0 %neg.a_lo = fneg half %a_lo %neg_lo.a = insertelement <2 x half> %a, half %neg.a_lo, i32 0 @@ -111,15 +102,10 @@ define float @v_fdot2_neg_a_hi(<2 x half> %a, <2 x half> %b, float %c) { ; GFX11: v_dot2acc_f32_f16 v2, v0, v1 ; GFX11: v_mov_b32_e32 v0, v2 ; -; GFX1170-LABEL: v_fdot2_neg_a_hi: -; GFX1170: ; %bb.0: -; GFX1170: v_xor_b16 v0.h, 0x8000, v0.h -; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 -; -; GFX12-LABEL: v_fdot2_neg_a_hi: -; GFX12: ; %bb.0: -; GFX12: v_xor_b16 v0.h, 0x8000, v0.h -; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX1170-GFX12-LABEL: v_fdot2_neg_a_hi: +; GFX1170-GFX12: ; %bb.0: +; GFX1170-GFX12: v_xor_b16 v0.h, 0x8000, v0.h +; GFX1170-GFX12: v_dot2_f32_f16 v0, v0, v1, v2 %a_hi = extractelement <2 x half> %a, i32 1 %neg.a_hi = fneg half %a_hi %neg_hi.a = insertelement <2 x half> %a, half %neg.a_hi, i32 1 @@ -159,15 +145,10 @@ define float @v_fdot2_neg_b_lo(<2 x half> %a, <2 x half> %b, float %c) { ; GFX11: v_dot2acc_f32_f16 v2, v0, v1 ; GFX11: v_mov_b32_e32 v0, v2 ; -; GFX1170-LABEL: v_fdot2_neg_b_lo: -; GFX1170: ; %bb.0: -; GFX1170: v_xor_b16 v1.l, 0x8000, v1.l -; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 -; -; GFX12-LABEL: v_fdot2_neg_b_lo: -; GFX12: ; %bb.0: -; GFX12: v_xor_b16 v1.l, 0x8000, v1.l -; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX1170-GFX12-LABEL: v_fdot2_neg_b_lo: +; GFX1170-GFX12: ; %bb.0: +; GFX1170-GFX12: v_xor_b16 v1.l, 0x8000, v1.l +; GFX1170-GFX12: 
v_dot2_f32_f16 v0, v0, v1, v2 %b_lo = extractelement <2 x half> %b, i32 0 %neg.b_lo = fneg half %b_lo %neg_lo.b = insertelement <2 x half> %b, half %neg.b_lo, i32 0 @@ -199,15 +180,10 @@ define float @v_fdot2_neg_b_hi(<2 x half> %a, <2 x half> %b, float %c) { ; GFX11: v_dot2acc_f32_f16 v2, v0, v1 ; GFX11: v_mov_b32_e32 v0, v2 ; -; GFX1170-LABEL: v_fdot2_neg_b_hi: -; GFX1170: ; %bb.0: -; GFX1170: v_xor_b16 v1.h, 0x8000, v1.h -; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 -; -; GFX12-LABEL: v_fdot2_neg_b_hi: -; GFX12: ; %bb.0: -; GFX12: v_xor_b16 v1.h, 0x8000, v1.h -; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX1170-GFX12-LABEL: v_fdot2_neg_b_hi: +; GFX1170-GFX12: ; %bb.0: +; GFX1170-GFX12: v_xor_b16 v1.h, 0x8000, v1.h +; GFX1170-GFX12: v_dot2_f32_f16 v0, v0, v1, v2 %b_hi = extractelement <2 x half> %b, i32 1 %neg.b_hi = fneg half %b_hi %neg_hi.b = insertelement <2 x half> %b, half %neg.b_hi, i32 1 @@ -255,15 +231,10 @@ define float @v_fdot2_opsel_lo_a(<2 x half> %a, <2 x half> %b, float %c) { ; GFX11: v_dot2acc_f32_f16 v2, v0, v1 ; GFX11: v_mov_b32_e32 v0, v2 ; -; GFX1170-LABEL: v_fdot2_opsel_lo_a: -; GFX1170: ; %bb.0: -; GFX1170: v_mov_b16_e32 v0.l, v0.h -; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 -; -; GFX12-LABEL: v_fdot2_opsel_lo_a: -; GFX12: ; %bb.0: -; GFX12: v_mov_b16_e32 v0.l, v0.h -; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX1170-GFX12-LABEL: v_fdot2_opsel_lo_a: +; GFX1170-GFX12: ; %bb.0: +; GFX1170-GFX12: v_mov_b16_e32 v0.l, v0.h +; GFX1170-GFX12: v_dot2_f32_f16 v0, v0, v1, v2 %shuf = shufflevector <2 x half> %a, <2 x half> poison, <2 x i32> <i32 1, i32 1> %r = call float @llvm.amdgcn.fdot2(<2 x half> %shuf, <2 x half> %b, float %c, i1 false) ret float %r @@ -291,15 +262,10 @@ define float @v_fdot2_opsel_hi_a(<2 x half> %a, <2 x half> %b, float %c) { ; GFX11: v_dot2acc_f32_f16 v2, v0, v1 ; GFX11: v_mov_b32_e32 v0, v2 ; -; GFX1170-LABEL: v_fdot2_opsel_hi_a: -; GFX1170: ; %bb.0: -; GFX1170: v_mov_b16_e32 v0.h, v0.l -; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 -; -; 
GFX12-LABEL: v_fdot2_opsel_hi_a: -; GFX12: ; %bb.0: -; GFX12: v_mov_b16_e32 v0.h, v0.l -; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX1170-GFX12-LABEL: v_fdot2_opsel_hi_a: +; GFX1170-GFX12: ; %bb.0: +; GFX1170-GFX12: v_mov_b16_e32 v0.h, v0.l +; GFX1170-GFX12: v_dot2_f32_f16 v0, v0, v1, v2 %shuf = shufflevector <2 x half> %a, <2 x half> poison, <2 x i32> <i32 0, i32 0> %r = call float @llvm.amdgcn.fdot2(<2 x half> %shuf, <2 x half> %b, float %c, i1 false) ret float %r @@ -327,15 +293,10 @@ define float @v_fdot2_opsel_lo_b(<2 x half> %a, <2 x half> %b, float %c) { ; GFX11: v_dot2acc_f32_f16 v2, v0, v1 ; GFX11: v_mov_b32_e32 v0, v2 ; -; GFX1170-LABEL: v_fdot2_opsel_lo_b: -; GFX1170: ; %bb.0: -; GFX1170: v_mov_b16_e32 v1.l, v1.h -; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 -; -; GFX12-LABEL: v_fdot2_opsel_lo_b: -; GFX12: ; %bb.0: -; GFX12: v_mov_b16_e32 v1.l, v1.h -; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX1170-GFX12-LABEL: v_fdot2_opsel_lo_b: +; GFX1170-GFX12: ; %bb.0: +; GFX1170-GFX12: v_mov_b16_e32 v1.l, v1.h +; GFX1170-GFX12: v_dot2_f32_f16 v0, v0, v1, v2 %shuf = shufflevector <2 x half> %b, <2 x half> poison, <2 x i32> <i32 1, i32 1> %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %shuf, float %c, i1 false) ret float %r @@ -363,15 +324,10 @@ define float @v_fdot2_opsel_hi_b(<2 x half> %a, <2 x half> %b, float %c) { ; GFX11: v_dot2acc_f32_f16 v2, v0, v1 ; GFX11: v_mov_b32_e32 v0, v2 ; -; GFX1170-LABEL: v_fdot2_opsel_hi_b: -; GFX1170: ; %bb.0: -; GFX1170: v_mov_b16_e32 v1.h, v1.l -; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 -; -; GFX12-LABEL: v_fdot2_opsel_hi_b: -; GFX12: ; %bb.0: -; GFX12: v_mov_b16_e32 v1.h, v1.l -; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX1170-GFX12-LABEL: v_fdot2_opsel_hi_b: +; GFX1170-GFX12: ; %bb.0: +; GFX1170-GFX12: v_mov_b16_e32 v1.h, v1.l +; GFX1170-GFX12: v_dot2_f32_f16 v0, v0, v1, v2 %shuf = shufflevector <2 x half> %b, <2 x half> poison, <2 x i32> <i32 0, i32 0> %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> 
%shuf, float %c, i1 false) ret float %r @@ -397,13 +353,9 @@ define float @v_fdot2_inline_literal_a(<2 x half> %b, float %c) { ; GFX11: v_dot2acc_f32_f16 v1, 0x40004000, v0 ; GFX11: v_mov_b32_e32 v0, v1 ; -; GFX1170-LABEL: v_fdot2_inline_literal_a: -; GFX1170: ; %bb.0: -; GFX1170: v_dot2_f32_f16 v0, 0x40004000, v0, v1 -; -; GFX12-LABEL: v_fdot2_inline_literal_a: -; GFX12: ; %bb.0: -; GFX12: v_dot2_f32_f16 v0, 0x40004000, v0, v1 +; GFX1170-GFX12-LABEL: v_fdot2_inline_literal_a: +; GFX1170-GFX12: ; %bb.0: +; GFX1170-GFX12: v_dot2_f32_f16 v0, 0x40004000, v0, v1 %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> <half 2.0, half 2.0>, <2 x half> %b, float %c, i1 false) ret float %ret } @@ -428,13 +380,9 @@ define float @v_fdot2_inline_literal_b(<2 x half> %a, float %c) { ; GFX11: v_dot2acc_f32_f16 v1, 0x40004000, v0 ; GFX11: v_mov_b32_e32 v0, v1 ; -; GFX1170-LABEL: v_fdot2_inline_literal_b: -; GFX1170: ; %bb.0: -; GFX1170: v_dot2_f32_f16 v0, v0, 0x40004000, v1 -; -; GFX12-LABEL: v_fdot2_inline_literal_b: -; GFX12: ; %bb.0: -; GFX12: v_dot2_f32_f16 v0, v0, 0x40004000, v1 +; GFX1170-GFX12-LABEL: v_fdot2_inline_literal_b: +; GFX1170-GFX12: ; %bb.0: +; GFX1170-GFX12: v_dot2_f32_f16 v0, 0x40004000, v0, v1 %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> <half 2.0, half 2.0>, float %c, i1 false) ret float %ret } @@ -462,13 +410,9 @@ define float @v_fdot2_inline_literal_c(<2 x half> %a, <2 x half> %b) { ; GFX11: v_dot2acc_f32_f16 v2, v0, v1 ; GFX11: v_mov_b32_e32 v0, v2 ; -; GFX1170-LABEL: v_fdot2_inline_literal_c: -; GFX1170: ; %bb.0: -; GFX1170: v_dot2_f32_f16 v0, v0, v1, 2.0 -; -; GFX12-LABEL: v_fdot2_inline_literal_c: -; GFX12: ; %bb.0: -; GFX12: v_dot2_f32_f16 v0, v0, v1, 2.0 +; GFX1170-GFX12-LABEL: v_fdot2_inline_literal_c: +; GFX1170-GFX12: ; %bb.0: +; GFX1170-GFX12: v_dot2_f32_f16 v0, v0, v1, 2.0 %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float 2.0, i1 false) ret float %ret } @@ -650,7 +594,7 @@ define float 
@v_fdot2_inline_literal_b_clamp(<2 x half> %a, float %c) { ; ; GFX11PLUS-LABEL: v_fdot2_inline_literal_b_clamp: ; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dot2_f32_f16 v0, v0, 0x40004000, v1 clamp +; GFX11PLUS: v_dot2_f32_f16 v0, 0x40004000, v0, v1 clamp %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> <half 2.0, half 2.0>, float %c, i1 true) ret float %ret } @@ -666,9 +610,9 @@ define float @v_fdot2_inline_literal_c_clamp(<2 x half> %a, <2 x half> %b) { define float @v_fdot2_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_dual: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 -; GFX906: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX906: v_add_f32_e32 v0, v0, v1 +; GFX906: v_dot2_f32_f16 v2, v0, v1, v2 +; GFX906: v_dot2_f32_f16 v5, v3, v4, v5 +; GFX906: v_add_f32_e32 v0, v2, v5 ; ; GFX950-LABEL: v_fdot2_dual: ; GFX950: ; %bb.0: @@ -682,22 +626,214 @@ define float @v_fdot2_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d ; GFX10: v_dot2c_f32_f16 v5, v3, v4 ; GFX10: v_add_f32_e32 v0, v2, v5 ; -; GFX11-LABEL: v_fdot2_dual: +; GFX11PLUS-LABEL: v_fdot2_dual: +; GFX11PLUS: ; %bb.0: +; GFX11PLUS: v_dual_dot2acc_f32_f16 v2, v0, v1 :: v_dual_dot2acc_f32_f16 v5, v3, v4 +; GFX11PLUS: v_add_f32_e32 v0, v2, v5 + %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 false) + %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false) + %r = fadd float %r0, %r1 + ret float %r +} + +define float @v_fdot2_dual_sgpr_src0_x(<2 x half> inreg %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { +; GFX906-LABEL: v_fdot2_dual_sgpr_src0_x: +; GFX906: ; %bb.0: +; GFX906: v_dot2_f32_f16 v1, s16, v0, v1 +; GFX906: v_dot2_f32_f16 v4, v2, v3, v4 +; GFX906: v_add_f32_e32 v0, v1, v4 +; +; GFX950-LABEL: v_fdot2_dual_sgpr_src0_x: +; GFX950: ; %bb.0: +; GFX950: v_dot2c_f32_f16_e32 v1, s0, v0 +; GFX950: v_dot2c_f32_f16_e32 v4, v2, v3 +; GFX950: 
v_add_f32_e32 v0, v1, v4 +; +; GFX10-LABEL: v_fdot2_dual_sgpr_src0_x: +; GFX10: ; %bb.0: +; GFX10: v_dot2c_f32_f16 v1, s16, v0 +; GFX10: v_dot2c_f32_f16 v4, v2, v3 +; GFX10: v_add_f32_e32 v0, v1, v4 +; +; GFX11PLUS-LABEL: v_fdot2_dual_sgpr_src0_x: +; GFX11PLUS: ; %bb.0: +; GFX11PLUS: v_dual_dot2acc_f32_f16 v1, s0, v0 :: v_dual_dot2acc_f32_f16 v4, v2, v3 +; GFX11PLUS: v_add_f32_e32 v0, v1, v4 + %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 false) + %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false) + %r = fadd float %r0, %r1 + ret float %r +} + +define float @v_fdot2_dual_sgpr_src1_x(<2 x half> %a, <2 x half> inreg %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { +; GFX906-LABEL: v_fdot2_dual_sgpr_src1_x: +; GFX906: ; %bb.0: +; GFX906: v_dot2_f32_f16 v1, v0, s16, v1 +; GFX906: v_dot2_f32_f16 v4, v2, v3, v4 +; GFX906: v_add_f32_e32 v0, v1, v4 +; +; GFX950-LABEL: v_fdot2_dual_sgpr_src1_x: +; GFX950: ; %bb.0: +; GFX950: v_dot2c_f32_f16_e32 v1, s0, v0 +; GFX950: v_dot2c_f32_f16_e32 v4, v2, v3 +; GFX950: v_add_f32_e32 v0, v1, v4 +; +; GFX10-LABEL: v_fdot2_dual_sgpr_src1_x: +; GFX10: ; %bb.0: +; GFX10: v_dot2c_f32_f16 v1, s16, v0 +; GFX10: v_dot2c_f32_f16 v4, v2, v3 +; GFX10: v_add_f32_e32 v0, v1, v4 +; +; GFX11-LABEL: v_fdot2_dual_sgpr_src1_x: +; GFX11: ; %bb.0: +; GFX11: v_dual_dot2acc_f32_f16 v1, s0, v0 :: v_dual_dot2acc_f32_f16 v4, v2, v3 +; GFX11: v_add_f32_e32 v0, v1, v4 +; +; GFX1170-GFX12-LABEL: v_fdot2_dual_sgpr_src1_x: +; GFX1170-GFX12: ; %bb.0: +; GFX1170-GFX12: v_dot2_f32_f16 v1, v0, s0, v1 +; GFX1170-GFX12: v_dot2_f32_f16 v4, v2, v3, v4 +; GFX1170-GFX12: v_add_f32_e32 v0, v1, v4 + %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 false) + %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false) + %r = fadd float %r0, %r1 + ret float %r +} + +define float @v_fdot2_dual_sgpr_src2_x(<2 x half> %a, <2 x half> %b, float inreg %c, <2 
x half> %d, <2 x half> %e, float %f) { +; GFX906-LABEL: v_fdot2_dual_sgpr_src2_x: +; GFX906: ; %bb.0: +; GFX906: v_dot2_f32_f16 v0, v0, v1, s16 +; GFX906: v_dot2_f32_f16 v4, v2, v3, v4 +; GFX906: v_add_f32_e32 v0, v0, v4 +; +; GFX950-LABEL: v_fdot2_dual_sgpr_src2_x: +; GFX950: ; %bb.0: +; GFX950: v_mov_b32_e32 v5, s0 +; GFX950: v_dot2c_f32_f16_e32 v5, v0, v1 +; GFX950: v_dot2c_f32_f16_e32 v4, v2, v3 +; GFX950: v_add_f32_e32 v0, v5, v4 +; +; GFX10-LABEL: v_fdot2_dual_sgpr_src2_x: +; GFX10: ; %bb.0: +; GFX10: v_mov_b32_e32 v5, s16 +; GFX10: v_dot2c_f32_f16 v4, v2, v3 +; GFX10: v_dot2c_f32_f16 v5, v0, v1 +; GFX10: v_add_f32_e32 v0, v5, v4 +; +; GFX11-LABEL: v_fdot2_dual_sgpr_src2_x: +; GFX11: ; %bb.0: +; GFX11: v_dual_mov_b32 v5, s0 :: v_dual_dot2acc_f32_f16 v4, v2, v3 +; GFX11: v_dot2acc_f32_f16 v5, v0, v1 +; GFX11: v_add_f32_e32 v0, v5, v4 +; +; GFX1170-GFX12-LABEL: v_fdot2_dual_sgpr_src2_x: +; GFX1170-GFX12: ; %bb.0: +; GFX1170-GFX12: v_dot2_f32_f16 v0, v0, v1, s0 +; GFX1170-GFX12: v_dot2_f32_f16 v4, v2, v3, v4 +; GFX1170-GFX12: v_add_f32_e32 v0, v0, v4 + %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 false) + %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false) + %r = fadd float %r0, %r1 + ret float %r +} + +define float @v_fdot2_dual_sgpr_src0_y(<2 x half> %a, <2 x half> %b, float %c, <2 x half> inreg %d, <2 x half> %e, <2 x half> %vopd_dst_pad, float %f) { +; GFX906-LABEL: v_fdot2_dual_sgpr_src0_y: +; GFX906: ; %bb.0: +; GFX906: v_dot2_f32_f16 v2, v0, v1, v2 +; GFX906: v_dot2_f32_f16 v5, s16, v3, v5 +; GFX906: v_add_f32_e32 v0, v2, v5 +; +; GFX950-LABEL: v_fdot2_dual_sgpr_src0_y: +; GFX950: ; %bb.0: +; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1 +; GFX950: v_dot2c_f32_f16_e32 v5, s0, v3 +; GFX950: v_add_f32_e32 v0, v2, v5 +; +; GFX10-LABEL: v_fdot2_dual_sgpr_src0_y: +; GFX10: ; %bb.0: +; GFX10: v_dot2c_f32_f16 v2, v0, v1 +; GFX10: v_dot2c_f32_f16 v5, s16, v3 +; GFX10: v_add_f32_e32 v0, v2, v5 +; +; 
GFX11PLUS-LABEL: v_fdot2_dual_sgpr_src0_y: +; GFX11PLUS: ; %bb.0: +; GFX11PLUS: v_dual_dot2acc_f32_f16 v2, v0, v1 :: v_dual_dot2acc_f32_f16 v5, s0, v3 +; GFX11PLUS: v_add_f32_e32 v0, v2, v5 + %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 false) + %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false) + %r = fadd float %r0, %r1 + ret float %r +} + +define float @v_fdot2_dual_sgpr_src1_y(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %vopd_dst_pad, <2 x half> inreg %e, float %f) { +; GFX906-LABEL: v_fdot2_dual_sgpr_src1_y: +; GFX906: ; %bb.0: +; GFX906: v_dot2_f32_f16 v2, v0, v1, v2 +; GFX906: v_dot2_f32_f16 v5, v3, s16, v5 +; GFX906: v_add_f32_e32 v0, v2, v5 +; +; GFX950-LABEL: v_fdot2_dual_sgpr_src1_y: +; GFX950: ; %bb.0: +; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1 +; GFX950: v_dot2c_f32_f16_e32 v5, s0, v3 +; GFX950: v_add_f32_e32 v0, v2, v5 +; +; GFX10-LABEL: v_fdot2_dual_sgpr_src1_y: +; GFX10: ; %bb.0: +; GFX10: v_dot2c_f32_f16 v2, v0, v1 +; GFX10: v_dot2c_f32_f16 v5, s16, v3 +; GFX10: v_add_f32_e32 v0, v2, v5 +; +; GFX11-LABEL: v_fdot2_dual_sgpr_src1_y: +; GFX11: ; %bb.0: +; GFX11: v_dual_dot2acc_f32_f16 v2, v0, v1 :: v_dual_dot2acc_f32_f16 v5, s0, v3 +; GFX11: v_add_f32_e32 v0, v2, v5 +; +; GFX1170-GFX12-LABEL: v_fdot2_dual_sgpr_src1_y: +; GFX1170-GFX12: ; %bb.0: +; GFX1170-GFX12: v_dot2_f32_f16 v2, v0, v1, v2 +; GFX1170-GFX12: v_dot2_f32_f16 v5, v3, s0, v5 +; GFX1170-GFX12: v_add_f32_e32 v0, v2, v5 + %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 false) + %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false) + %r = fadd float %r0, %r1 + ret float %r +} + +define float @v_fdot2_dual_sgpr_src2_y(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float inreg %f) { +; GFX906-LABEL: v_fdot2_dual_sgpr_src2_y: +; GFX906: ; %bb.0: +; GFX906: v_dot2_f32_f16 v2, v0, v1, v2 +; GFX906: v_dot2_f32_f16 v0, v3, 
v4, s16 +; GFX906: v_add_f32_e32 v0, v2, v0 +; +; GFX950-LABEL: v_fdot2_dual_sgpr_src2_y: +; GFX950: ; %bb.0: +; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1 +; GFX950: v_mov_b32_e32 v0, s0 +; GFX950: v_dot2c_f32_f16_e32 v0, v3, v4 +; GFX950: v_add_f32_e32 v0, v2, v0 +; +; GFX10-LABEL: v_fdot2_dual_sgpr_src2_y: +; GFX10: ; %bb.0: +; GFX10: v_mov_b32_e32 v5, s16 +; GFX10: v_dot2c_f32_f16 v2, v0, v1 +; GFX10: v_dot2c_f32_f16 v5, v3, v4 +; GFX10: v_add_f32_e32 v0, v2, v5 +; +; GFX11-LABEL: v_fdot2_dual_sgpr_src2_y: ; GFX11: ; %bb.0: -; GFX11: v_dual_dot2acc_f32_f16 v2, v0, v1 :: v_dual_dot2acc_f32_f16 v5, v3, v4 +; GFX11: v_dual_mov_b32 v5, s0 :: v_dual_dot2acc_f32_f16 v2, v0, v1 +; GFX11: v_dot2acc_f32_f16 v5, v3, v4 ; GFX11: v_add_f32_e32 v0, v2, v5 ; -; GFX1170-LABEL: v_fdot2_dual: -; GFX1170: ; %bb.0: -; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 -; GFX1170: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX1170: v_add_f32_e32 v0, v0, v1 -; -; GFX12-LABEL: v_fdot2_dual: -; GFX12: ; %bb.0: -; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 -; GFX12: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX12: v_add_f32_e32 v0, v0, v1 +; GFX1170-GFX12-LABEL: v_fdot2_dual_sgpr_src2_y: +; GFX1170-GFX12: ; %bb.0: +; GFX1170-GFX12: v_dot2_f32_f16 v2, v0, v1, v2 +; GFX1170-GFX12: v_dot2_f32_f16 v0, v3, v4, s0 +; GFX1170-GFX12: v_add_f32_e32 v0, v2, v0 %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false) %r = fadd float %r0, %r1 @@ -707,39 +843,33 @@ define float @v_fdot2_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d define float @v_fdot2_neg_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_neg_a_dual: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] -; GFX906: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX906: v_add_f32_e32 v0, v0, v1 +; GFX906: v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[1,0,0] 
neg_hi:[1,0,0] +; GFX906: v_dot2_f32_f16 v5, v3, v4, v5 +; GFX906: v_add_f32_e32 v0, v2, v5 ; ; GFX950-LABEL: v_fdot2_neg_a_dual: ; GFX950: ; %bb.0: -; GFX950: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX950: v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX950: v_dot2c_f32_f16_e32 v5, v3, v4 -; GFX950: v_add_f32_e32 v0, v0, v5 +; GFX950: v_add_f32_e32 v0, v2, v5 ; ; GFX10-LABEL: v_fdot2_neg_a_dual: ; GFX10: ; %bb.0: -; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX10: v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_add_f32_e32 v0, v0, v5 +; GFX10: v_add_f32_e32 v0, v2, v5 ; ; GFX11-LABEL: v_fdot2_neg_a_dual: ; GFX11: ; %bb.0: -; GFX11: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX11: v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX11: v_dot2acc_f32_f16 v5, v3, v4 -; GFX11: v_add_f32_e32 v0, v0, v5 -; -; GFX1170-LABEL: v_fdot2_neg_a_dual: -; GFX1170: ; %bb.0: -; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] -; GFX1170: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX1170: v_add_f32_e32 v0, v0, v1 -; -; GFX12-LABEL: v_fdot2_neg_a_dual: -; GFX12: ; %bb.0: -; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] -; GFX12: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX12: v_add_f32_e32 v0, v0, v1 +; GFX11: v_add_f32_e32 v0, v2, v5 +; +; GFX1170-GFX12-LABEL: v_fdot2_neg_a_dual: +; GFX1170-GFX12: ; %bb.0: +; GFX1170-GFX12: v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX1170-GFX12: v_dot2_f32_f16 v5, v3, v4, v5 +; GFX1170-GFX12: v_add_f32_e32 v0, v2, v5 %neg.a = fneg <2 x half> %a %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %neg.a, <2 x half> %b, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false) @@ -750,9 +880,9 @@ define float @v_fdot2_neg_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha define float 
@v_fdot2_neg_a_lo_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_neg_a_lo_dual: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] -; GFX906: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX906: v_add_f32_e32 v0, v0, v1 +; GFX906: v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[1,0,0] +; GFX906: v_dot2_f32_f16 v5, v3, v4, v5 +; GFX906: v_add_f32_e32 v0, v2, v5 ; ; GFX950-LABEL: v_fdot2_neg_a_lo_dual: ; GFX950: ; %bb.0: @@ -765,29 +895,15 @@ define float @v_fdot2_neg_a_lo_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ; ; GFX10-LABEL: v_fdot2_neg_a_lo_dual: ; GFX10: ; %bb.0: -; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] +; GFX10: v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[1,0,0] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_add_f32_e32 v0, v0, v5 -; -; GFX11-LABEL: v_fdot2_neg_a_lo_dual: -; GFX11: ; %bb.0: -; GFX11: v_xor_b16 v0.l, 0x8000, v0.l -; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 -; GFX11: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v2, v5 ; -; GFX1170-LABEL: v_fdot2_neg_a_lo_dual: -; GFX1170: ; %bb.0: -; GFX1170: v_xor_b16 v0.l, 0x8000, v0.l -; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 -; GFX1170: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX1170: v_add_f32_e32 v0, v0, v1 -; -; GFX12-LABEL: v_fdot2_neg_a_lo_dual: -; GFX12: ; %bb.0: -; GFX12: v_xor_b16 v0.l, 0x8000, v0.l -; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 -; GFX12: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX12: v_add_f32_e32 v0, v0, v1 +; GFX11PLUS-LABEL: v_fdot2_neg_a_lo_dual: +; GFX11PLUS: ; %bb.0: +; GFX11PLUS: v_xor_b16 v0.l, 0x8000, v0.l +; GFX11PLUS: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 +; GFX11PLUS: v_add_f32_e32 v0, v2, v5 %a_lo = extractelement <2 x half> %a, i32 0 %neg.a_lo = fneg half %a_lo %neg_lo.a = insertelement <2 x half> %a, half %neg.a_lo, i32 0 @@ -800,9 +916,9 @@ define float @v_fdot2_neg_a_lo_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x define 
float @v_fdot2_neg_a_hi_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_neg_a_hi_dual: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[1,0,0] -; GFX906: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX906: v_add_f32_e32 v0, v0, v1 +; GFX906: v_dot2_f32_f16 v2, v0, v1, v2 neg_hi:[1,0,0] +; GFX906: v_dot2_f32_f16 v5, v3, v4, v5 +; GFX906: v_add_f32_e32 v0, v2, v5 ; ; GFX950-LABEL: v_fdot2_neg_a_hi_dual: ; GFX950: ; %bb.0: @@ -816,29 +932,15 @@ define float @v_fdot2_neg_a_hi_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ; ; GFX10-LABEL: v_fdot2_neg_a_hi_dual: ; GFX10: ; %bb.0: -; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[1,0,0] +; GFX10: v_dot2_f32_f16 v2, v0, v1, v2 neg_hi:[1,0,0] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_add_f32_e32 v0, v0, v5 -; -; GFX11-LABEL: v_fdot2_neg_a_hi_dual: -; GFX11: ; %bb.0: -; GFX11: v_xor_b16 v0.h, 0x8000, v0.h -; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 -; GFX11: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v2, v5 ; -; GFX1170-LABEL: v_fdot2_neg_a_hi_dual: -; GFX1170: ; %bb.0: -; GFX1170: v_xor_b16 v0.h, 0x8000, v0.h -; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 -; GFX1170: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX1170: v_add_f32_e32 v0, v0, v1 -; -; GFX12-LABEL: v_fdot2_neg_a_hi_dual: -; GFX12: ; %bb.0: -; GFX12: v_xor_b16 v0.h, 0x8000, v0.h -; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 -; GFX12: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX12: v_add_f32_e32 v0, v0, v1 +; GFX11PLUS-LABEL: v_fdot2_neg_a_hi_dual: +; GFX11PLUS: ; %bb.0: +; GFX11PLUS: v_xor_b16 v0.h, 0x8000, v0.h +; GFX11PLUS: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 +; GFX11PLUS: v_add_f32_e32 v0, v2, v5 %a_hi = extractelement <2 x half> %a, i32 1 %neg.a_hi = fneg half %a_hi %neg_hi.a = insertelement <2 x half> %a, half %neg.a_hi, i32 1 @@ -851,39 +953,33 @@ define float @v_fdot2_neg_a_hi_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x 
define float @v_fdot2_neg_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_neg_b_dual: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] -; GFX906: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX906: v_add_f32_e32 v0, v0, v1 +; GFX906: v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX906: v_dot2_f32_f16 v5, v3, v4, v5 +; GFX906: v_add_f32_e32 v0, v2, v5 ; ; GFX950-LABEL: v_fdot2_neg_b_dual: ; GFX950: ; %bb.0: -; GFX950: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX950: v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] ; GFX950: v_dot2c_f32_f16_e32 v5, v3, v4 -; GFX950: v_add_f32_e32 v0, v0, v5 +; GFX950: v_add_f32_e32 v0, v2, v5 ; ; GFX10-LABEL: v_fdot2_neg_b_dual: ; GFX10: ; %bb.0: -; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX10: v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_add_f32_e32 v0, v0, v5 +; GFX10: v_add_f32_e32 v0, v2, v5 ; ; GFX11-LABEL: v_fdot2_neg_b_dual: ; GFX11: ; %bb.0: -; GFX11: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX11: v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] ; GFX11: v_dot2acc_f32_f16 v5, v3, v4 -; GFX11: v_add_f32_e32 v0, v0, v5 -; -; GFX1170-LABEL: v_fdot2_neg_b_dual: -; GFX1170: ; %bb.0: -; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] -; GFX1170: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX1170: v_add_f32_e32 v0, v0, v1 -; -; GFX12-LABEL: v_fdot2_neg_b_dual: -; GFX12: ; %bb.0: -; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] -; GFX12: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX12: v_add_f32_e32 v0, v0, v1 +; GFX11: v_add_f32_e32 v0, v2, v5 +; +; GFX1170-GFX12-LABEL: v_fdot2_neg_b_dual: +; GFX1170-GFX12: ; %bb.0: +; GFX1170-GFX12: v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX1170-GFX12: v_dot2_f32_f16 v5, v3, v4, v5 
+; GFX1170-GFX12: v_add_f32_e32 v0, v2, v5 %neg.b = fneg <2 x half> %b %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %neg.b, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false) @@ -894,9 +990,9 @@ define float @v_fdot2_neg_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha define float @v_fdot2_neg_b_lo_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_neg_b_lo_dual: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] -; GFX906: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX906: v_add_f32_e32 v0, v0, v1 +; GFX906: v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,1,0] +; GFX906: v_dot2_f32_f16 v5, v3, v4, v5 +; GFX906: v_add_f32_e32 v0, v2, v5 ; ; GFX950-LABEL: v_fdot2_neg_b_lo_dual: ; GFX950: ; %bb.0: @@ -909,29 +1005,15 @@ define float @v_fdot2_neg_b_lo_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ; ; GFX10-LABEL: v_fdot2_neg_b_lo_dual: ; GFX10: ; %bb.0: -; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] +; GFX10: v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,1,0] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_add_f32_e32 v0, v0, v5 -; -; GFX11-LABEL: v_fdot2_neg_b_lo_dual: -; GFX11: ; %bb.0: -; GFX11: v_xor_b16 v1.l, 0x8000, v1.l -; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 -; GFX11: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v2, v5 ; -; GFX1170-LABEL: v_fdot2_neg_b_lo_dual: -; GFX1170: ; %bb.0: -; GFX1170: v_xor_b16 v1.l, 0x8000, v1.l -; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 -; GFX1170: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX1170: v_add_f32_e32 v0, v0, v1 -; -; GFX12-LABEL: v_fdot2_neg_b_lo_dual: -; GFX12: ; %bb.0: -; GFX12: v_xor_b16 v1.l, 0x8000, v1.l -; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 -; GFX12: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX12: v_add_f32_e32 v0, v0, v1 +; GFX11PLUS-LABEL: v_fdot2_neg_b_lo_dual: +; GFX11PLUS: ; %bb.0: +; GFX11PLUS: v_xor_b16 v1.l, 0x8000, v1.l +; 
GFX11PLUS: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 +; GFX11PLUS: v_add_f32_e32 v0, v2, v5 %b_lo = extractelement <2 x half> %b, i32 0 %neg.b_lo = fneg half %b_lo %neg_lo.b = insertelement <2 x half> %b, half %neg.b_lo, i32 0 @@ -944,9 +1026,9 @@ define float @v_fdot2_neg_b_lo_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x define float @v_fdot2_neg_b_hi_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_neg_b_hi_dual: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,1,0] -; GFX906: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX906: v_add_f32_e32 v0, v0, v1 +; GFX906: v_dot2_f32_f16 v2, v0, v1, v2 neg_hi:[0,1,0] +; GFX906: v_dot2_f32_f16 v5, v3, v4, v5 +; GFX906: v_add_f32_e32 v0, v2, v5 ; ; GFX950-LABEL: v_fdot2_neg_b_hi_dual: ; GFX950: ; %bb.0: @@ -960,29 +1042,15 @@ define float @v_fdot2_neg_b_hi_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ; ; GFX10-LABEL: v_fdot2_neg_b_hi_dual: ; GFX10: ; %bb.0: -; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,1,0] +; GFX10: v_dot2_f32_f16 v2, v0, v1, v2 neg_hi:[0,1,0] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_add_f32_e32 v0, v0, v5 -; -; GFX11-LABEL: v_fdot2_neg_b_hi_dual: -; GFX11: ; %bb.0: -; GFX11: v_xor_b16 v1.h, 0x8000, v1.h -; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 -; GFX11: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v2, v5 ; -; GFX1170-LABEL: v_fdot2_neg_b_hi_dual: -; GFX1170: ; %bb.0: -; GFX1170: v_xor_b16 v1.h, 0x8000, v1.h -; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 -; GFX1170: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX1170: v_add_f32_e32 v0, v0, v1 -; -; GFX12-LABEL: v_fdot2_neg_b_hi_dual: -; GFX12: ; %bb.0: -; GFX12: v_xor_b16 v1.h, 0x8000, v1.h -; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 -; GFX12: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX12: v_add_f32_e32 v0, v0, v1 +; GFX11PLUS-LABEL: v_fdot2_neg_b_hi_dual: +; GFX11PLUS: ; %bb.0: +; GFX11PLUS: v_xor_b16 v1.h, 0x8000, 
v1.h +; GFX11PLUS: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 +; GFX11PLUS: v_add_f32_e32 v0, v2, v5 %b_hi = extractelement <2 x half> %b, i32 1 %neg.b_hi = fneg half %b_hi %neg_hi.b = insertelement <2 x half> %b, half %neg.b_hi, i32 1 @@ -995,39 +1063,33 @@ define float @v_fdot2_neg_b_hi_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x define float @v_fdot2_neg_c_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_neg_c_dual: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] -; GFX906: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX906: v_add_f32_e32 v0, v0, v1 +; GFX906: v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,0,1] +; GFX906: v_dot2_f32_f16 v5, v3, v4, v5 +; GFX906: v_add_f32_e32 v0, v2, v5 ; ; GFX950-LABEL: v_fdot2_neg_c_dual: ; GFX950: ; %bb.0: -; GFX950: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] +; GFX950: v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,0,1] ; GFX950: v_dot2c_f32_f16_e32 v5, v3, v4 -; GFX950: v_add_f32_e32 v0, v0, v5 +; GFX950: v_add_f32_e32 v0, v2, v5 ; ; GFX10-LABEL: v_fdot2_neg_c_dual: ; GFX10: ; %bb.0: -; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] +; GFX10: v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,0,1] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_add_f32_e32 v0, v0, v5 +; GFX10: v_add_f32_e32 v0, v2, v5 ; ; GFX11-LABEL: v_fdot2_neg_c_dual: ; GFX11: ; %bb.0: -; GFX11: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] +; GFX11: v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,0,1] ; GFX11: v_dot2acc_f32_f16 v5, v3, v4 -; GFX11: v_add_f32_e32 v0, v0, v5 -; -; GFX1170-LABEL: v_fdot2_neg_c_dual: -; GFX1170: ; %bb.0: -; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] -; GFX1170: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX1170: v_add_f32_e32 v0, v0, v1 -; -; GFX12-LABEL: v_fdot2_neg_c_dual: -; GFX12: ; %bb.0: -; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] -; GFX12: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX12: v_add_f32_e32 v0, v0, v1 +; GFX11: 
v_add_f32_e32 v0, v2, v5 +; +; GFX1170-GFX12-LABEL: v_fdot2_neg_c_dual: +; GFX1170-GFX12: ; %bb.0: +; GFX1170-GFX12: v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,0,1] +; GFX1170-GFX12: v_dot2_f32_f16 v5, v3, v4, v5 +; GFX1170-GFX12: v_add_f32_e32 v0, v2, v5 %neg.c = fneg float %c %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %neg.c, i1 false) %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false) @@ -1038,39 +1100,33 @@ define float @v_fdot2_neg_c_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha define float @v_fdot2_abs_c_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_abs_c_dual: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1] -; GFX906: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX906: v_add_f32_e32 v0, v0, v1 +; GFX906: v_dot2_f32_f16 v2, v0, v1, v2 neg_hi:[0,0,1] +; GFX906: v_dot2_f32_f16 v5, v3, v4, v5 +; GFX906: v_add_f32_e32 v0, v2, v5 ; ; GFX950-LABEL: v_fdot2_abs_c_dual: ; GFX950: ; %bb.0: -; GFX950: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1] +; GFX950: v_dot2_f32_f16 v2, v0, v1, v2 neg_hi:[0,0,1] ; GFX950: v_dot2c_f32_f16_e32 v5, v3, v4 -; GFX950: v_add_f32_e32 v0, v0, v5 +; GFX950: v_add_f32_e32 v0, v2, v5 ; ; GFX10-LABEL: v_fdot2_abs_c_dual: ; GFX10: ; %bb.0: -; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1] +; GFX10: v_dot2_f32_f16 v2, v0, v1, v2 neg_hi:[0,0,1] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_add_f32_e32 v0, v0, v5 +; GFX10: v_add_f32_e32 v0, v2, v5 ; ; GFX11-LABEL: v_fdot2_abs_c_dual: ; GFX11: ; %bb.0: -; GFX11: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1] +; GFX11: v_dot2_f32_f16 v2, v0, v1, v2 neg_hi:[0,0,1] ; GFX11: v_dot2acc_f32_f16 v5, v3, v4 -; GFX11: v_add_f32_e32 v0, v0, v5 -; -; GFX1170-LABEL: v_fdot2_abs_c_dual: -; GFX1170: ; %bb.0: -; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1] -; GFX1170: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX1170: v_add_f32_e32 v0, v0, v1 -; -; 
GFX12-LABEL: v_fdot2_abs_c_dual: -; GFX12: ; %bb.0: -; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1] -; GFX12: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX12: v_add_f32_e32 v0, v0, v1 +; GFX11: v_add_f32_e32 v0, v2, v5 +; +; GFX1170-GFX12-LABEL: v_fdot2_abs_c_dual: +; GFX1170-GFX12: ; %bb.0: +; GFX1170-GFX12: v_dot2_f32_f16 v2, v0, v1, v2 neg_hi:[0,0,1] +; GFX1170-GFX12: v_dot2_f32_f16 v5, v3, v4, v5 +; GFX1170-GFX12: v_add_f32_e32 v0, v2, v5 %abs.c = call float @llvm.fabs.f32(float %c) %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %abs.c, i1 false) %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false) @@ -1081,9 +1137,9 @@ define float @v_fdot2_abs_c_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha define float @v_fdot2_opsel_lo_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_opsel_lo_a_dual: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[1,0,0] -; GFX906: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX906: v_add_f32_e32 v0, v0, v1 +; GFX906: v_dot2_f32_f16 v2, v0, v1, v2 op_sel:[1,0,0] +; GFX906: v_dot2_f32_f16 v5, v3, v4, v5 +; GFX906: v_add_f32_e32 v0, v2, v5 ; ; GFX950-LABEL: v_fdot2_opsel_lo_a_dual: ; GFX950: ; %bb.0: @@ -1095,29 +1151,15 @@ define float @v_fdot2_opsel_lo_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2 ; ; GFX10-LABEL: v_fdot2_opsel_lo_a_dual: ; GFX10: ; %bb.0: -; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[1,0,0] +; GFX10: v_dot2_f32_f16 v2, v0, v1, v2 op_sel:[1,0,0] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_add_f32_e32 v0, v0, v5 -; -; GFX11-LABEL: v_fdot2_opsel_lo_a_dual: -; GFX11: ; %bb.0: -; GFX11: v_mov_b16_e32 v0.l, v0.h -; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 -; GFX11: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v2, v5 ; -; GFX1170-LABEL: v_fdot2_opsel_lo_a_dual: -; GFX1170: ; %bb.0: -; GFX1170: v_mov_b16_e32 v0.l, v0.h -; GFX1170: 
v_dot2_f32_f16 v0, v0, v1, v2 -; GFX1170: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX1170: v_add_f32_e32 v0, v0, v1 -; -; GFX12-LABEL: v_fdot2_opsel_lo_a_dual: -; GFX12: ; %bb.0: -; GFX12: v_mov_b16_e32 v0.l, v0.h -; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 -; GFX12: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX12: v_add_f32_e32 v0, v0, v1 +; GFX11PLUS-LABEL: v_fdot2_opsel_lo_a_dual: +; GFX11PLUS: ; %bb.0: +; GFX11PLUS: v_mov_b16_e32 v0.l, v0.h +; GFX11PLUS: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 +; GFX11PLUS: v_add_f32_e32 v0, v2, v5 %shuf = shufflevector <2 x half> %a, <2 x half> poison, <2 x i32> <i32 1, i32 1> %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %shuf, <2 x half> %b, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false) @@ -1128,9 +1170,9 @@ define float @v_fdot2_opsel_lo_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2 define float @v_fdot2_opsel_hi_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_opsel_hi_a_dual: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[0,1,1] -; GFX906: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX906: v_add_f32_e32 v0, v0, v1 +; GFX906: v_dot2_f32_f16 v2, v0, v1, v2 op_sel_hi:[0,1,1] +; GFX906: v_dot2_f32_f16 v5, v3, v4, v5 +; GFX906: v_add_f32_e32 v0, v2, v5 ; ; GFX950-LABEL: v_fdot2_opsel_hi_a_dual: ; GFX950: ; %bb.0: @@ -1142,29 +1184,15 @@ define float @v_fdot2_opsel_hi_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2 ; ; GFX10-LABEL: v_fdot2_opsel_hi_a_dual: ; GFX10: ; %bb.0: -; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[0,1,1] +; GFX10: v_dot2_f32_f16 v2, v0, v1, v2 op_sel_hi:[0,1,1] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_add_f32_e32 v0, v0, v5 -; -; GFX11-LABEL: v_fdot2_opsel_hi_a_dual: -; GFX11: ; %bb.0: -; GFX11: v_mov_b16_e32 v0.h, v0.l -; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 -; GFX11: v_add_f32_e32 v0, v2, v5 +; 
GFX10: v_add_f32_e32 v0, v2, v5 ; -; GFX1170-LABEL: v_fdot2_opsel_hi_a_dual: -; GFX1170: ; %bb.0: -; GFX1170: v_mov_b16_e32 v0.h, v0.l -; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 -; GFX1170: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX1170: v_add_f32_e32 v0, v0, v1 -; -; GFX12-LABEL: v_fdot2_opsel_hi_a_dual: -; GFX12: ; %bb.0: -; GFX12: v_mov_b16_e32 v0.h, v0.l -; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 -; GFX12: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX12: v_add_f32_e32 v0, v0, v1 +; GFX11PLUS-LABEL: v_fdot2_opsel_hi_a_dual: +; GFX11PLUS: ; %bb.0: +; GFX11PLUS: v_mov_b16_e32 v0.h, v0.l +; GFX11PLUS: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 +; GFX11PLUS: v_add_f32_e32 v0, v2, v5 %shuf = shufflevector <2 x half> %a, <2 x half> poison, <2 x i32> <i32 0, i32 0> %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %shuf, <2 x half> %b, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false) @@ -1175,9 +1203,9 @@ define float @v_fdot2_opsel_hi_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2 define float @v_fdot2_opsel_lo_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_opsel_lo_b_dual: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[0,1,0] -; GFX906: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX906: v_add_f32_e32 v0, v0, v1 +; GFX906: v_dot2_f32_f16 v2, v0, v1, v2 op_sel:[0,1,0] +; GFX906: v_dot2_f32_f16 v5, v3, v4, v5 +; GFX906: v_add_f32_e32 v0, v2, v5 ; ; GFX950-LABEL: v_fdot2_opsel_lo_b_dual: ; GFX950: ; %bb.0: @@ -1189,29 +1217,15 @@ define float @v_fdot2_opsel_lo_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2 ; ; GFX10-LABEL: v_fdot2_opsel_lo_b_dual: ; GFX10: ; %bb.0: -; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[0,1,0] +; GFX10: v_dot2_f32_f16 v2, v0, v1, v2 op_sel:[0,1,0] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_add_f32_e32 v0, v0, v5 -; -; GFX11-LABEL: v_fdot2_opsel_lo_b_dual: -; GFX11: ; %bb.0: -; GFX11: v_mov_b16_e32 
v1.l, v1.h -; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 -; GFX11: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v2, v5 ; -; GFX1170-LABEL: v_fdot2_opsel_lo_b_dual: -; GFX1170: ; %bb.0: -; GFX1170: v_mov_b16_e32 v1.l, v1.h -; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 -; GFX1170: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX1170: v_add_f32_e32 v0, v0, v1 -; -; GFX12-LABEL: v_fdot2_opsel_lo_b_dual: -; GFX12: ; %bb.0: -; GFX12: v_mov_b16_e32 v1.l, v1.h -; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 -; GFX12: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX12: v_add_f32_e32 v0, v0, v1 +; GFX11PLUS-LABEL: v_fdot2_opsel_lo_b_dual: +; GFX11PLUS: ; %bb.0: +; GFX11PLUS: v_mov_b16_e32 v1.l, v1.h +; GFX11PLUS: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 +; GFX11PLUS: v_add_f32_e32 v0, v2, v5 %shuf = shufflevector <2 x half> %b, <2 x half> poison, <2 x i32> <i32 1, i32 1> %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %shuf, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false) @@ -1222,9 +1236,9 @@ define float @v_fdot2_opsel_lo_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2 define float @v_fdot2_opsel_hi_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_opsel_hi_b_dual: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[1,0,1] -; GFX906: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX906: v_add_f32_e32 v0, v0, v1 +; GFX906: v_dot2_f32_f16 v2, v0, v1, v2 op_sel_hi:[1,0,1] +; GFX906: v_dot2_f32_f16 v5, v3, v4, v5 +; GFX906: v_add_f32_e32 v0, v2, v5 ; ; GFX950-LABEL: v_fdot2_opsel_hi_b_dual: ; GFX950: ; %bb.0: @@ -1236,29 +1250,15 @@ define float @v_fdot2_opsel_hi_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2 ; ; GFX10-LABEL: v_fdot2_opsel_hi_b_dual: ; GFX10: ; %bb.0: -; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[1,0,1] +; GFX10: v_dot2_f32_f16 v2, v0, v1, v2 op_sel_hi:[1,0,1] ; GFX10: 
v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_add_f32_e32 v0, v0, v5 -; -; GFX11-LABEL: v_fdot2_opsel_hi_b_dual: -; GFX11: ; %bb.0: -; GFX11: v_mov_b16_e32 v1.h, v1.l -; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 -; GFX11: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v2, v5 ; -; GFX1170-LABEL: v_fdot2_opsel_hi_b_dual: -; GFX1170: ; %bb.0: -; GFX1170: v_mov_b16_e32 v1.h, v1.l -; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 -; GFX1170: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX1170: v_add_f32_e32 v0, v0, v1 -; -; GFX12-LABEL: v_fdot2_opsel_hi_b_dual: -; GFX12: ; %bb.0: -; GFX12: v_mov_b16_e32 v1.h, v1.l -; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 -; GFX12: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX12: v_add_f32_e32 v0, v0, v1 +; GFX11PLUS-LABEL: v_fdot2_opsel_hi_b_dual: +; GFX11PLUS: ; %bb.0: +; GFX11PLUS: v_mov_b16_e32 v1.h, v1.l +; GFX11PLUS: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 +; GFX11PLUS: v_add_f32_e32 v0, v2, v5 %shuf = shufflevector <2 x half> %b, <2 x half> poison, <2 x i32> <i32 0, i32 0> %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %shuf, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false) @@ -1272,9 +1272,9 @@ define float @v_fdot2_opsel_hi_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2 define float @v_fdot2_inline_literal_a_x(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_inline_literal_a_x: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v0, 2.0, v1, v2 op_sel_hi:[0,1,1] -; GFX906: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX906: v_add_f32_e32 v0, v0, v1 +; GFX906: v_dot2_f32_f16 v2, 2.0, v1, v2 op_sel_hi:[0,1,1] +; GFX906: v_dot2_f32_f16 v5, v3, v4, v5 +; GFX906: v_add_f32_e32 v0, v2, v5 ; ; GFX950-LABEL: v_fdot2_inline_literal_a_x: ; GFX950: ; %bb.0: @@ -1288,22 +1288,10 @@ define float @v_fdot2_inline_literal_a_x(<2 x half> %a, <2 x half> %b, float %c, ; GFX10: v_dot2c_f32_f16 v5, 
v3, v4 ; GFX10: v_add_f32_e32 v0, v2, v5 ; -; GFX11-LABEL: v_fdot2_inline_literal_a_x: -; GFX11: ; %bb.0: -; GFX11: v_dual_dot2acc_f32_f16 v2, 0x40004000, v1 :: v_dual_dot2acc_f32_f16 v5, v3, v4 -; GFX11: v_add_f32_e32 v0, v2, v5 -; -; GFX1170-LABEL: v_fdot2_inline_literal_a_x: -; GFX1170: ; %bb.0: -; GFX1170: v_dot2_f32_f16 v0, 0x40004000, v1, v2 -; GFX1170: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX1170: v_add_f32_e32 v0, v0, v1 -; -; GFX12-LABEL: v_fdot2_inline_literal_a_x: -; GFX12: ; %bb.0: -; GFX12: v_dot2_f32_f16 v0, 0x40004000, v1, v2 -; GFX12: v_dot2_f32_f16 v1, v3, v4, v5 -; GFX12: v_add_f32_e32 v0, v0, v1 +; GFX11PLUS-LABEL: v_fdot2_inline_literal_a_x: +; GFX11PLUS: ; %bb.0: +; GFX11PLUS: v_dual_dot2acc_f32_f16 v2, 0x40004000, v1 :: v_dual_dot2acc_f32_f16 v5, v3, v4 +; GFX11PLUS: v_add_f32_e32 v0, v2, v5 %r0 = call float @llvm.amdgcn.fdot2(<2 x half> <half 2.0, half 2.0>, <2 x half> %b, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false) %r = fadd float %r0, %r1 @@ -1313,9 +1301,9 @@ define float @v_fdot2_inline_literal_a_x(<2 x half> %a, <2 x half> %b, float %c, define float @v_fdot2_inline_literal_a_y(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_inline_literal_a_y: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 -; GFX906: v_dot2_f32_f16 v1, 2.0, v4, v5 op_sel_hi:[0,1,1] -; GFX906: v_add_f32_e32 v0, v0, v1 +; GFX906: v_dot2_f32_f16 v2, v0, v1, v2 +; GFX906: v_dot2_f32_f16 v5, 2.0, v4, v5 op_sel_hi:[0,1,1] +; GFX906: v_add_f32_e32 v0, v2, v5 ; ; GFX950-LABEL: v_fdot2_inline_literal_a_y: ; GFX950: ; %bb.0: @@ -1329,22 +1317,10 @@ define float @v_fdot2_inline_literal_a_y(<2 x half> %a, <2 x half> %b, float %c, ; GFX10: v_dot2c_f32_f16 v5, 0x40004000, v4 ; GFX10: v_add_f32_e32 v0, v2, v5 ; -; GFX11-LABEL: v_fdot2_inline_literal_a_y: -; GFX11: ; %bb.0: -; GFX11: v_dual_dot2acc_f32_f16 v2, v0, v1 :: v_dual_dot2acc_f32_f16 v5, 0x40004000, 
v4 -; GFX11: v_add_f32_e32 v0, v2, v5 -; -; GFX1170-LABEL: v_fdot2_inline_literal_a_y: -; GFX1170: ; %bb.0: -; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 -; GFX1170: v_dot2_f32_f16 v1, 0x40004000, v4, v5 -; GFX1170: v_add_f32_e32 v0, v0, v1 -; -; GFX12-LABEL: v_fdot2_inline_literal_a_y: -; GFX12: ; %bb.0: -; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 -; GFX12: v_dot2_f32_f16 v1, 0x40004000, v4, v5 -; GFX12: v_add_f32_e32 v0, v0, v1 +; GFX11PLUS-LABEL: v_fdot2_inline_literal_a_y: +; GFX11PLUS: ; %bb.0: +; GFX11PLUS: v_dual_dot2acc_f32_f16 v2, v0, v1 :: v_dual_dot2acc_f32_f16 v5, 0x40004000, v4 +; GFX11PLUS: v_add_f32_e32 v0, v2, v5 %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2(<2 x half> <half 2.0, half 2.0>, <2 x half> %e, float %f, i1 false) %r = fadd float %r0, %r1 @@ -1354,9 +1330,9 @@ define float @v_fdot2_inline_literal_a_y(<2 x half> %a, <2 x half> %b, float %c, define float @v_fdot2_inline_literal_a_xy(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_inline_literal_a_xy: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v0, 2.0, v1, v2 op_sel_hi:[0,1,1] -; GFX906: v_dot2_f32_f16 v1, 2.0, v4, v5 op_sel_hi:[0,1,1] -; GFX906: v_add_f32_e32 v0, v0, v1 +; GFX906: v_dot2_f32_f16 v2, 2.0, v1, v2 op_sel_hi:[0,1,1] +; GFX906: v_dot2_f32_f16 v5, 2.0, v4, v5 op_sel_hi:[0,1,1] +; GFX906: v_add_f32_e32 v0, v2, v5 ; ; GFX950-LABEL: v_fdot2_inline_literal_a_xy: ; GFX950: ; %bb.0: @@ -1370,22 +1346,10 @@ define float @v_fdot2_inline_literal_a_xy(<2 x half> %a, <2 x half> %b, float %c ; GFX10: v_dot2c_f32_f16 v5, 0x40004000, v4 ; GFX10: v_add_f32_e32 v0, v2, v5 ; -; GFX11-LABEL: v_fdot2_inline_literal_a_xy: -; GFX11: ; %bb.0: -; GFX11: v_dual_dot2acc_f32_f16 v2, 0x40004000, v1 :: v_dual_dot2acc_f32_f16 v5, 0x40004000, v4 -; GFX11: v_add_f32_e32 v0, v2, v5 -; -; GFX1170-LABEL: v_fdot2_inline_literal_a_xy: -; GFX1170: ; %bb.0: -; GFX1170: v_dot2_f32_f16 v0, 
0x40004000, v1, v2 -; GFX1170: v_dot2_f32_f16 v1, 0x40004000, v4, v5 -; GFX1170: v_add_f32_e32 v0, v0, v1 -; -; GFX12-LABEL: v_fdot2_inline_literal_a_xy: -; GFX12: ; %bb.0: -; GFX12: v_dot2_f32_f16 v0, 0x40004000, v1, v2 -; GFX12: v_dot2_f32_f16 v1, 0x40004000, v4, v5 -; GFX12: v_add_f32_e32 v0, v0, v1 +; GFX11PLUS-LABEL: v_fdot2_inline_literal_a_xy: +; GFX11PLUS: ; %bb.0: +; GFX11PLUS: v_dual_dot2acc_f32_f16 v2, 0x40004000, v1 :: v_dual_dot2acc_f32_f16 v5, 0x40004000, v4 +; GFX11PLUS: v_add_f32_e32 v0, v2, v5 %r0 = call float @llvm.amdgcn.fdot2(<2 x half> <half 2.0, half 2.0>, <2 x half> %b, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2(<2 x half> <half 2.0, half 2.0>, <2 x half> %e, float %f, i1 false) %r = fadd float %r0, %r1 @@ -1395,9 +1359,9 @@ define float @v_fdot2_inline_literal_a_xy(<2 x half> %a, <2 x half> %b, float %c define float @v_fdot2_inline_literal_b_x(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_inline_literal_b_x: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v0, v0, 2.0, v2 op_sel_hi:[1,0,1] -; GFX906: v_dot2_f32_f16 v1, v4, v3, v5 -; GFX906: v_add_f32_e32 v0, v0, v1 +; GFX906: v_dot2_f32_f16 v2, v0, 2.0, v2 op_sel_hi:[1,0,1] +; GFX906: v_dot2_f32_f16 v5, v4, v3, v5 +; GFX906: v_add_f32_e32 v0, v2, v5 ; ; GFX950-LABEL: v_fdot2_inline_literal_b_x: ; GFX950: ; %bb.0: @@ -1411,22 +1375,10 @@ define float @v_fdot2_inline_literal_b_x(<2 x half> %a, <2 x half> %b, float %c, ; GFX10: v_dot2c_f32_f16 v5, v4, v3 ; GFX10: v_add_f32_e32 v0, v2, v5 ; -; GFX11-LABEL: v_fdot2_inline_literal_b_x: -; GFX11: ; %bb.0: -; GFX11: v_dual_dot2acc_f32_f16 v2, 0x40004000, v0 :: v_dual_dot2acc_f32_f16 v5, v4, v3 -; GFX11: v_add_f32_e32 v0, v2, v5 -; -; GFX1170-LABEL: v_fdot2_inline_literal_b_x: -; GFX1170: ; %bb.0: -; GFX1170: v_dot2_f32_f16 v0, v0, 0x40004000, v2 -; GFX1170: v_dot2_f32_f16 v1, v4, v3, v5 -; GFX1170: v_add_f32_e32 v0, v0, v1 -; -; GFX12-LABEL: v_fdot2_inline_literal_b_x: -; GFX12: ; 
%bb.0: -; GFX12: v_dot2_f32_f16 v0, v0, 0x40004000, v2 -; GFX12: v_dot2_f32_f16 v1, v4, v3, v5 -; GFX12: v_add_f32_e32 v0, v0, v1 +; GFX11PLUS-LABEL: v_fdot2_inline_literal_b_x: +; GFX11PLUS: ; %bb.0: +; GFX11PLUS: v_dual_dot2acc_f32_f16 v2, 0x40004000, v0 :: v_dual_dot2acc_f32_f16 v5, v4, v3 +; GFX11PLUS: v_add_f32_e32 v0, v2, v5 %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> <half 2.0, half 2.0>, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %e, <2 x half> %d, float %f, i1 false) %r = fadd float %r0, %r1 @@ -1436,9 +1388,9 @@ define float @v_fdot2_inline_literal_b_x(<2 x half> %a, <2 x half> %b, float %c, define float @v_fdot2_inline_literal_b_y(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_inline_literal_b_y: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v0, v1, v0, v2 -; GFX906: v_dot2_f32_f16 v1, v3, 2.0, v5 op_sel_hi:[1,0,1] -; GFX906: v_add_f32_e32 v0, v0, v1 +; GFX906: v_dot2_f32_f16 v2, v1, v0, v2 +; GFX906: v_dot2_f32_f16 v5, v3, 2.0, v5 op_sel_hi:[1,0,1] +; GFX906: v_add_f32_e32 v0, v2, v5 ; ; GFX950-LABEL: v_fdot2_inline_literal_b_y: ; GFX950: ; %bb.0: @@ -1452,22 +1404,10 @@ define float @v_fdot2_inline_literal_b_y(<2 x half> %a, <2 x half> %b, float %c, ; GFX10: v_dot2c_f32_f16 v5, 0x40004000, v3 ; GFX10: v_add_f32_e32 v0, v2, v5 ; -; GFX11-LABEL: v_fdot2_inline_literal_b_y: -; GFX11: ; %bb.0: -; GFX11: v_dual_dot2acc_f32_f16 v2, v1, v0 :: v_dual_dot2acc_f32_f16 v5, 0x40004000, v3 -; GFX11: v_add_f32_e32 v0, v2, v5 -; -; GFX1170-LABEL: v_fdot2_inline_literal_b_y: -; GFX1170: ; %bb.0: -; GFX1170: v_dot2_f32_f16 v0, v1, v0, v2 -; GFX1170: v_dot2_f32_f16 v1, v3, 0x40004000, v5 -; GFX1170: v_add_f32_e32 v0, v0, v1 -; -; GFX12-LABEL: v_fdot2_inline_literal_b_y: -; GFX12: ; %bb.0: -; GFX12: v_dot2_f32_f16 v0, v1, v0, v2 -; GFX12: v_dot2_f32_f16 v1, v3, 0x40004000, v5 -; GFX12: v_add_f32_e32 v0, v0, v1 +; GFX11PLUS-LABEL: v_fdot2_inline_literal_b_y: +; GFX11PLUS: ; 
%bb.0: +; GFX11PLUS: v_dual_dot2acc_f32_f16 v2, v1, v0 :: v_dual_dot2acc_f32_f16 v5, 0x40004000, v3 +; GFX11PLUS: v_add_f32_e32 v0, v2, v5 %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %b, <2 x half> %a, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> <half 2.0, half 2.0>, float %f, i1 false) %r = fadd float %r0, %r1 @@ -1477,9 +1417,9 @@ define float @v_fdot2_inline_literal_b_y(<2 x half> %a, <2 x half> %b, float %c, define float @v_fdot2_inline_literal_b_xy(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_inline_literal_b_xy: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v0, v0, 2.0, v2 op_sel_hi:[1,0,1] -; GFX906: v_dot2_f32_f16 v1, v3, 2.0, v5 op_sel_hi:[1,0,1] -; GFX906: v_add_f32_e32 v0, v0, v1 +; GFX906: v_dot2_f32_f16 v2, v0, 2.0, v2 op_sel_hi:[1,0,1] +; GFX906: v_dot2_f32_f16 v5, v3, 2.0, v5 op_sel_hi:[1,0,1] +; GFX906: v_add_f32_e32 v0, v2, v5 ; ; GFX950-LABEL: v_fdot2_inline_literal_b_xy: ; GFX950: ; %bb.0: @@ -1493,22 +1433,10 @@ define float @v_fdot2_inline_literal_b_xy(<2 x half> %a, <2 x half> %b, float %c ; GFX10: v_dot2c_f32_f16 v5, 0x40004000, v3 ; GFX10: v_add_f32_e32 v0, v2, v5 ; -; GFX11-LABEL: v_fdot2_inline_literal_b_xy: -; GFX11: ; %bb.0: -; GFX11: v_dual_dot2acc_f32_f16 v2, 0x40004000, v0 :: v_dual_dot2acc_f32_f16 v5, 0x40004000, v3 -; GFX11: v_add_f32_e32 v0, v2, v5 -; -; GFX1170-LABEL: v_fdot2_inline_literal_b_xy: -; GFX1170: ; %bb.0: -; GFX1170: v_dot2_f32_f16 v0, v0, 0x40004000, v2 -; GFX1170: v_dot2_f32_f16 v1, v3, 0x40004000, v5 -; GFX1170: v_add_f32_e32 v0, v0, v1 -; -; GFX12-LABEL: v_fdot2_inline_literal_b_xy: -; GFX12: ; %bb.0: -; GFX12: v_dot2_f32_f16 v0, v0, 0x40004000, v2 -; GFX12: v_dot2_f32_f16 v1, v3, 0x40004000, v5 -; GFX12: v_add_f32_e32 v0, v0, v1 +; GFX11PLUS-LABEL: v_fdot2_inline_literal_b_xy: +; GFX11PLUS: ; %bb.0: +; GFX11PLUS: v_dual_dot2acc_f32_f16 v2, 0x40004000, v0 :: v_dual_dot2acc_f32_f16 v5, 0x40004000, v3 +; GFX11PLUS: 
v_add_f32_e32 v0, v2, v5 %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> <half 2.0, half 2.0>, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> <half 2.0, half 2.0>, float %f, i1 false) %r = fadd float %r0, %r1 @@ -1519,8 +1447,8 @@ define float @v_fdot2_inline_literal_c_dual(<2 x half> %a, <2 x half> %b, <2 x h ; GFX906-LABEL: v_fdot2_inline_literal_c_dual: ; GFX906: ; %bb.0: ; GFX906: v_dot2_f32_f16 v0, v0, v1, 2.0 -; GFX906: v_dot2_f32_f16 v1, v2, v3, v4 -; GFX906: v_add_f32_e32 v0, v0, v1 +; GFX906: v_dot2_f32_f16 v4, v2, v3, v4 +; GFX906: v_add_f32_e32 v0, v0, v4 ; ; GFX950-LABEL: v_fdot2_inline_literal_c_dual: ; GFX950: ; %bb.0: @@ -1542,17 +1470,11 @@ define float @v_fdot2_inline_literal_c_dual(<2 x half> %a, <2 x half> %b, <2 x h ; GFX11: v_dot2acc_f32_f16 v5, v0, v1 ; GFX11: v_add_f32_e32 v0, v5, v4 ; -; GFX1170-LABEL: v_fdot2_inline_literal_c_dual: -; GFX1170: ; %bb.0: -; GFX1170: v_dot2_f32_f16 v0, v0, v1, 2.0 -; GFX1170: v_dot2_f32_f16 v1, v2, v3, v4 -; GFX1170: v_add_f32_e32 v0, v0, v1 -; -; GFX12-LABEL: v_fdot2_inline_literal_c_dual: -; GFX12: ; %bb.0: -; GFX12: v_dot2_f32_f16 v0, v0, v1, 2.0 -; GFX12: v_dot2_f32_f16 v1, v2, v3, v4 -; GFX12: v_add_f32_e32 v0, v0, v1 +; GFX1170-GFX12-LABEL: v_fdot2_inline_literal_c_dual: +; GFX1170-GFX12: ; %bb.0: +; GFX1170-GFX12: v_dot2_f32_f16 v0, v0, v1, 2.0 +; GFX1170-GFX12: v_dot2_f32_f16 v4, v2, v3, v4 +; GFX1170-GFX12: v_add_f32_e32 v0, v0, v4 %r0 = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float 2.0, i1 false) %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false) %r = fadd float %r0, %r1 @@ -1562,9 +1484,9 @@ define float @v_fdot2_inline_literal_c_dual(<2 x half> %a, <2 x half> %b, <2 x h define float @v_fdot2_clamp_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GCN-LABEL: v_fdot2_clamp_dual: ; GCN: ; %bb.0: -; GCN: v_dot2_f32_f16 v0, v0, v1, v2 clamp -; GCN: 
v_dot2_f32_f16 v1, v3, v4, v5 clamp -; GCN: v_add_f32_e32 v0, v0, v1 +; GCN: v_dot2_f32_f16 v2, v0, v1, v2 clamp +; GCN: v_dot2_f32_f16 v5, v3, v4, v5 clamp +; GCN: v_add_f32_e32 v0, v2, v5 %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 true) %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 true) %r = fadd float %r0, %r1 _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
