https://github.com/petar-avramovic created 
https://github.com/llvm/llvm-project/pull/196516

For V_DOT2_F32_F16 and V_DOT2_F32_BF16, add their VOPDName and mark
them with usesCustomInserter, which is used to add pre-RA register
allocation hints that prefer assigning dst and src2 to the same
physical register. When the hint is satisfied, canMapVOP3PToVOPD
recognises the instruction as eligible for VOPD pairing by checking
that it is VOP2-like: dst==src2, no source modifiers, no clamp, and
src1 is a register. Mark both instructions as commutable to allow a
literal in src1 to be moved to src0, since VOPD only permits a literal
in src0.

The original patch had a bug: it did not check whether physical src
registers match the register class of the corresponding operand in
full VOPD instructions. That check is now done via isValidVOPDSrc.

>From 3ba58a54cb984defd100f5afa87d92465c6befb4 Mon Sep 17 00:00:00 2001
From: Petar Avramovic <[email protected]>
Date: Fri, 8 May 2026 11:07:09 +0200
Subject: [PATCH] AMDGPU: Reland: Codegen for v_dual_dot2acc_f32_f16/bf16 from
 VOP3

For V_DOT2_F32_F16 and V_DOT2_F32_BF16, add their VOPDName and mark
them with usesCustomInserter, which is used to add pre-RA register
allocation hints that prefer assigning dst and src2 to the same
physical register. When the hint is satisfied, canMapVOP3PToVOPD
recognises the instruction as eligible for VOPD pairing by checking
that it is VOP2-like: dst==src2, no source modifiers, no clamp, and
src1 is a register. Mark both instructions as commutable to allow a
literal in src1 to be moved to src0, since VOPD only permits a literal
in src0.

The original patch had a bug: it did not check whether physical src
registers match the register class of the corresponding operand in
full VOPD instructions. That check is now done via isValidVOPDSrc.
---
 llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp       |  33 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   8 +
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |   6 +
 llvm/lib/Target/AMDGPU/VOP3PInstructions.td   |  13 +-
 llvm/lib/Target/AMDGPU/VOPInstructions.td     |   4 +-
 .../AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll      | 235 +++--
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll | 972 ++++++++----------
 7 files changed, 669 insertions(+), 602 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp 
b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
index fb26175769d94..254569d20113b 100644
--- a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
@@ -60,6 +60,36 @@ bool isValidVOPDSrc(const SIInstrInfo &TII, int VOPDOpc, 
unsigned CompIdx,
   return TII.getRegClass(TII.get(VOPDOpc), Idx)->contains(PhysSrcReg);
 }
 
+// Check if MI is a VOP3P instruction with operands that satisfy the 
constraints
+// for mapping it to a VOP2/VOPD opcode: no modifiers, no clamp, src1 and src2
+// are registers (src0 can be register or literal), and src2 is same as dst.
+static bool canMapVOP3PToVOPD(const MachineInstr &MI) {
+  unsigned Opc = MI.getOpcode();
+  if (Opc != AMDGPU::V_DOT2_F32_F16 && Opc != AMDGPU::V_DOT2_F32_BF16)
+    return false;
+  // src0 can be register or literal
+  int16_t Src0ModsIdx = getNamedOperandIdx(Opc, 
AMDGPU::OpName::src0_modifiers);
+  if (MI.getOperand(Src0ModsIdx).getImm() != SISrcMods::OP_SEL_1)
+    return false;
+  int16_t Src1ModsIdx = getNamedOperandIdx(Opc, 
AMDGPU::OpName::src1_modifiers);
+  if (MI.getOperand(Src1ModsIdx).getImm() != SISrcMods::OP_SEL_1)
+    return false;
+  int16_t Src1Idx = getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
+  if (!MI.getOperand(Src1Idx).isReg())
+    return false;
+  int16_t Src2ModsIdx = getNamedOperandIdx(Opc, 
AMDGPU::OpName::src2_modifiers);
+  if (MI.getOperand(Src2ModsIdx).getImm() != SISrcMods::OP_SEL_1)
+    return false;
+  int16_t Src2Idx = getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
+  if (!MI.getOperand(Src2Idx).isReg())
+    return false;
+  int16_t ClampIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::clamp);
+  if (MI.getOperand(ClampIdx).getImm() != 0)
+    return false;
+  int16_t VdstIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
+  return MI.getOperand(VdstIdx).getReg() == MI.getOperand(Src2Idx).getReg();
+}
+
 bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII,
                                    const MachineInstr &MIX,
                                    const MachineInstr &MIY, bool IsVOPD3,
@@ -71,7 +101,8 @@ bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII,
 
   if (IsVOPD3 && !ST.hasVOPD3())
     return false;
-  if (!IsVOPD3 && (TII.isVOP3(MIX) || TII.isVOP3(MIY)))
+  if (!IsVOPD3 && ((TII.isVOP3(MIX) && !canMapVOP3PToVOPD(MIX)) ||
+                   (TII.isVOP3(MIY) && !canMapVOP3PToVOPD(MIY))))
     return false;
   if (TII.isDPP(MIX) || TII.isDPP(MIY))
     return false;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 20599228beea8..92642165c161b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7223,6 +7223,14 @@ 
SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     MI.getOperand(0).setReg(OriginalExec);
     return BB;
   }
+  case AMDGPU::V_DOT2_F32_F16:
+  case AMDGPU::V_DOT2_F32_BF16: {
+    // Hint RA to assign dst and src2 the same physical register.
+    // On targets that have a VOPD but no VOP2 variant of the instruction, this
+    // is one of the conditions for attempting to convert VOP3P to VOPD.
+    MRI.setSimpleHint(MI.getOperand(0).getReg(), MI.getOperand(6).getReg());
+    return BB;
+  }
   default:
     if (TII->isImage(MI) || TII->isMUBUF(MI)) {
       if (!MI.mayStore())
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp 
b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 1c145359ccc61..fafbb0d9198fa 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -893,6 +893,12 @@ ComponentProps::ComponentProps(const MCInstrDesc &OpDesc, 
bool VOP3Layout) {
     NumVOPD3Mods = 2;
     if (IsVOP3)
       SrcOperandsNum = 3;
+  } else if (Opcode == AMDGPU::V_DOT2_F32_F16 ||
+             Opcode == AMDGPU::V_DOT2_F32_BF16) {
+    // VOP3P opcodes that have a VOPD but no VOP2 version. Use the VOPD3
+    // path in getIndexOfSrcInMCOperands to get correct src operand indexes,
+    // while still generating VOPD, not VOPD3.
+    NumVOPD3Mods = SrcOperandsNum;
   } else if (isSISrcFPOperand(OpDesc,
                               getNamedOperandIdx(Opcode, OpName::src0))) {
     // All FP VOPD instructions have Neg modifiers for all operands except
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td 
b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 7ae93b30e7f03..0c281360409fa 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -87,11 +87,13 @@ multiclass VOP3PInst<string OpName, VOPProfile P,
 }
 
 multiclass VOP3PInstDotWithDual<string OpName, VOPProfile P,
-                                SDPatternOperator node = null_frag> {
+                                SDPatternOperator node = null_frag,
+                                bits<6> VOPDOp, string VOPDName> {
   def NAME : VOP3P_Pseudo<OpName, P,
                           getVOP3PModPat<P, node,
                                          1 /*HasExplicitClamp*/, 1/*IsDOT*/,
-                                         VOP3PModsDOT, VOP3PModsF32>.ret>;
+                                         VOP3PModsDOT, VOP3PModsF32>.ret>,
+             VOPD_Component<VOPDOp, VOPDName>;
   let SubtargetPredicate = isGFX11Plus in {
   if P.HasExtVOP3DPP then
     def _dpp : VOP3_DPP_Pseudo<OpName, P> {
@@ -612,12 +614,12 @@ defm V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16",
   VOP3P_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2, 1>;
 } // End OtherPredicates = [HasDot2Insts]
 
-let OtherPredicates = [HasDot10Insts] in
+let OtherPredicates = [HasDot10Insts], isCommutable = 1, usesCustomInserter = 
1 in
 defm V_DOT2_F32_F16 :
   VOP3PInstDotWithDual<"v_dot2_f32_f16",
                        VOP3P_Profile<VOP_F32_V2F16_V2F16_F32, VOP3_REGULAR,
                                      /*HasDPP*/ 1>,
-                       AMDGPUfdot2>;
+                       AMDGPUfdot2, 0xC, "v_dot2acc_f32_f16">;
 
 let OtherPredicates = [HasDot7Insts] in {
 defm V_DOT4_U32_U8  : VOP3PInst<"v_dot4_u32_u8",
@@ -640,9 +642,10 @@ def DOT2_BF16_Profile
 
 let SubtargetPredicate = HasDot12Insts  in {
 
+let isCommutable = 1, usesCustomInserter = 1 in
 defm V_DOT2_F32_BF16 :
   VOP3PInstDotWithDual<"v_dot2_f32_bf16", DOT2_BF16_Profile,
-                       int_amdgcn_fdot2_f32_bf16>;
+                       int_amdgcn_fdot2_f32_bf16, 0xD, "v_dot2acc_f32_bf16">;
 
 } // End SubtargetPredicate = HasDot12Insts
 
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td 
b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 82545a472cf17..40b0476a84d25 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -34,8 +34,8 @@ class VOP <string opName> {
   string OpName = opName;
 }
 
-// First 13 insts from VOPDY are also VOPDX. DOT2ACC_F32_BF16 is omitted
-defvar VOPDX_Max_Index = 12;
+// First 14 insts from VOPDY are also VOPDX.
+defvar VOPDX_Max_Index = 13;
 defvar VOPD3X_Max_Index = 36;
 
 class VOPD_Component<bits<6> OpIn, string vOPDName> {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
index 1dff54ac35427..87142517d1448 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
@@ -225,7 +225,7 @@ define float @v_fdot2_f32_bf16_inline_literal_b(<2 x 
bfloat> %a, float %c) {
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, 0x40004000, v1
+; GFX11PLUS:    v_dot2_f32_bf16 v0, 0x40004000, v0, v1
   %ret = tail call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x 
bfloat> <bfloat 2.0, bfloat 2.0>, float %c, i1 false)
   ret float %ret
 }
@@ -373,7 +373,7 @@ define float @v_fdot2_f32_bf16_inline_literal_b_clamp(<2 x 
bfloat> %a, float %c)
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b_clamp:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, 0x40004000, v1 clamp
+; GFX11PLUS:    v_dot2_f32_bf16 v0, 0x40004000, v0, v1 clamp
   %ret = tail call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x 
bfloat> <bfloat 2.0, bfloat 2.0>, float %c, i1 true)
   ret float %ret
 }
@@ -395,9 +395,119 @@ define float @v_fdot2_f32_bf16_dual(<2 x bfloat> %a, <2 x 
bfloat> %b, float %c,
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_dual:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dual_dot2acc_f32_bf16 v2, v0, v1 :: v_dual_dot2acc_f32_bf16 
v5, v3, v4
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
+  %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> 
%b, float %c, i1 false)
+  %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> 
%e, float %f, i1 false)
+  %r = fadd float %r0, %r1
+  ret float %r
+}
+
+
+
+
+define float @v_fdot2_f32_bf16_dual_sgpr_src0_x(<2 x bfloat> inreg %a, <2 x 
bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
+; GFX950-LABEL: v_fdot2_f32_bf16_dual_sgpr_src0_x:
+; GFX950:  ; %bb.0:
+; GFX950:    v_dot2c_f32_bf16_e32 v1, s0, v0
+; GFX950:    v_dot2c_f32_bf16_e32 v4, v2, v3
+; GFX950:    v_add_f32_e32 v0, v1, v4
+;
+; GFX11PLUS-LABEL: v_fdot2_f32_bf16_dual_sgpr_src0_x:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_dual_dot2acc_f32_bf16 v1, s0, v0 :: v_dual_dot2acc_f32_bf16 
v4, v2, v3
+; GFX11PLUS:    v_add_f32_e32 v0, v1, v4
+  %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> 
%b, float %c, i1 false)
+  %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> 
%e, float %f, i1 false)
+  %r = fadd float %r0, %r1
+  ret float %r
+}
+
+define float @v_fdot2_f32_bf16_dual_sgpr_src1_x(<2 x bfloat> %a, <2 x bfloat> 
inreg %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
+; GFX950-LABEL: v_fdot2_f32_bf16_dual_sgpr_src1_x:
+; GFX950:  ; %bb.0:
+; GFX950:    v_dot2c_f32_bf16_e32 v1, s0, v0
+; GFX950:    v_dot2c_f32_bf16_e32 v4, v2, v3
+; GFX950:    v_add_f32_e32 v0, v1, v4
+;
+; GFX11PLUS-LABEL: v_fdot2_f32_bf16_dual_sgpr_src1_x:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_dot2_f32_bf16 v1, v0, s0, v1
+; GFX11PLUS:    v_dot2_f32_bf16 v4, v2, v3, v4
+; GFX11PLUS:    v_add_f32_e32 v0, v1, v4
+  %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> 
%b, float %c, i1 false)
+  %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> 
%e, float %f, i1 false)
+  %r = fadd float %r0, %r1
+  ret float %r
+}
+
+define float @v_fdot2_f32_bf16_dual_sgpr_src2_x(<2 x bfloat> %a, <2 x bfloat> 
%b, float inreg %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
+; GFX950-LABEL: v_fdot2_f32_bf16_dual_sgpr_src2_x:
+; GFX950:  ; %bb.0:
+; GFX950:    v_mov_b32_e32 v5, s0
+; GFX950:    v_dot2c_f32_bf16_e32 v5, v0, v1
+; GFX950:    v_dot2c_f32_bf16_e32 v4, v2, v3
+; GFX950:    v_add_f32_e32 v0, v5, v4
+;
+; GFX11PLUS-LABEL: v_fdot2_f32_bf16_dual_sgpr_src2_x:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, s0
+; GFX11PLUS:    v_dot2_f32_bf16 v4, v2, v3, v4
+; GFX11PLUS:    v_add_f32_e32 v0, v0, v4
+  %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> 
%b, float %c, i1 false)
+  %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> 
%e, float %f, i1 false)
+  %r = fadd float %r0, %r1
+  ret float %r
+}
+
+define float @v_fdot2_f32_bf16_dual_sgpr_src0_y(<2 x bfloat> %a, <2 x bfloat> 
%b, float %c, <2 x bfloat> inreg %d, <2 x bfloat> %e, <2 x bfloat> 
%vopd_dst_pad, float %f) {
+; GFX950-LABEL: v_fdot2_f32_bf16_dual_sgpr_src0_y:
+; GFX950:  ; %bb.0:
+; GFX950:    v_dot2c_f32_bf16_e32 v2, v0, v1
+; GFX950:    v_dot2c_f32_bf16_e32 v5, s0, v3
+; GFX950:    v_add_f32_e32 v0, v2, v5
+;
+; GFX11PLUS-LABEL: v_fdot2_f32_bf16_dual_sgpr_src0_y:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_dual_dot2acc_f32_bf16 v2, v0, v1 :: v_dual_dot2acc_f32_bf16 
v5, s0, v3
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
+  %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> 
%b, float %c, i1 false)
+  %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> 
%e, float %f, i1 false)
+  %r = fadd float %r0, %r1
+  ret float %r
+}
+
+define float @v_fdot2_f32_bf16_dual_sgpr_src1_y(<2 x bfloat> %a, <2 x bfloat> 
%b, float %c, <2 x bfloat> %d, <2 x bfloat> %vopd_dst_pad, <2 x bfloat> inreg 
%e, float %f) {
+; GFX950-LABEL: v_fdot2_f32_bf16_dual_sgpr_src1_y:
+; GFX950:  ; %bb.0:
+; GFX950:    v_dot2c_f32_bf16_e32 v2, v0, v1
+; GFX950:    v_dot2c_f32_bf16_e32 v5, s0, v3
+; GFX950:    v_add_f32_e32 v0, v2, v5
+;
+; GFX11PLUS-LABEL: v_fdot2_f32_bf16_dual_sgpr_src1_y:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_dot2_f32_bf16 v2, v0, v1, v2
+; GFX11PLUS:    v_dot2_f32_bf16 v5, v3, s0, v5
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
+  %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> 
%b, float %c, i1 false)
+  %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> 
%e, float %f, i1 false)
+  %r = fadd float %r0, %r1
+  ret float %r
+}
+
+define float @v_fdot2_f32_bf16_dual_sgpr_src2_y(<2 x bfloat> %a, <2 x bfloat> 
%b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float inreg %f) {
+; GFX950-LABEL: v_fdot2_f32_bf16_dual_sgpr_src2_y:
+; GFX950:  ; %bb.0:
+; GFX950:    v_dot2c_f32_bf16_e32 v2, v0, v1
+; GFX950:    v_mov_b32_e32 v0, s0
+; GFX950:    v_dot2c_f32_bf16_e32 v0, v3, v4
+; GFX950:    v_add_f32_e32 v0, v2, v0
+;
+; GFX11PLUS-LABEL: v_fdot2_f32_bf16_dual_sgpr_src2_y:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_dot2_f32_bf16 v2, v0, v1, v2
+; GFX11PLUS:    v_dot2_f32_bf16 v0, v3, v4, s0
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v0
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> 
%b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> 
%e, float %f, i1 false)
   %r = fadd float %r0, %r1
@@ -407,15 +517,15 @@ define float @v_fdot2_f32_bf16_dual(<2 x bfloat> %a, <2 x 
bfloat> %b, float %c,
 define float @v_fdot2_f32_bf16_neg_a_dual(<2 x bfloat> %a, <2 x bfloat> %b, 
float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
 ; GFX950-LABEL: v_fdot2_f32_bf16_neg_a_dual:
 ; GFX950:  ; %bb.0:
-; GFX950:    v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX950:    v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX950:    v_dot2c_f32_bf16_e32 v5, v3, v4
-; GFX950:    v_add_f32_e32 v0, v0, v5
+; GFX950:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_a_dual:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX11PLUS:    v_dot2_f32_bf16 v5, v3, v4, v5
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %neg.a = fneg <2 x bfloat> %a
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %neg.a, <2 x 
bfloat> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> 
%e, float %f, i1 false)
@@ -436,9 +546,8 @@ define float @v_fdot2_f32_bf16_neg_a_lo_dual(<2 x bfloat> 
%a, <2 x bfloat> %b, f
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_a_lo_dual:
 ; GFX11PLUS:  ; %bb.0:
 ; GFX11PLUS:    v_xor_b16 v0.l, 0x8000, v0.l
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 
v2, v0, v1
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %a_lo = extractelement <2 x bfloat> %a, i32 0
   %neg.a_lo = fneg bfloat %a_lo
   %neg_lo.a = insertelement <2 x bfloat> %a, bfloat %neg.a_lo, i32 0
@@ -462,9 +571,8 @@ define float @v_fdot2_f32_bf16_neg_a_hi_dual(<2 x bfloat> 
%a, <2 x bfloat> %b, f
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_a_hi_dual:
 ; GFX11PLUS:  ; %bb.0:
 ; GFX11PLUS:    v_xor_b16 v0.h, 0x8000, v0.h
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 
v2, v0, v1
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %a_hi = extractelement <2 x bfloat> %a, i32 1
   %neg.a_hi = fneg bfloat %a_hi
   %neg_hi.a = insertelement <2 x bfloat> %a, bfloat %neg.a_hi, i32 1
@@ -477,15 +585,15 @@ define float @v_fdot2_f32_bf16_neg_a_hi_dual(<2 x bfloat> 
%a, <2 x bfloat> %b, f
 define float @v_fdot2_f32_bf16_neg_b_dual(<2 x bfloat> %a, <2 x bfloat> %b, 
float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
 ; GFX950-LABEL: v_fdot2_f32_bf16_neg_b_dual:
 ; GFX950:  ; %bb.0:
-; GFX950:    v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX950:    v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX950:    v_dot2c_f32_bf16_e32 v5, v3, v4
-; GFX950:    v_add_f32_e32 v0, v0, v5
+; GFX950:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_b_dual:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX11PLUS:    v_dot2_f32_bf16 v5, v3, v4, v5
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %neg.b = fneg <2 x bfloat> %b
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> 
%neg.b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> 
%e, float %f, i1 false)
@@ -506,9 +614,8 @@ define float @v_fdot2_f32_bf16_neg_b_lo_dual(<2 x bfloat> 
%a, <2 x bfloat> %b, f
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_b_lo_dual:
 ; GFX11PLUS:  ; %bb.0:
 ; GFX11PLUS:    v_xor_b16 v1.l, 0x8000, v1.l
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 
v2, v0, v1
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %b_lo = extractelement <2 x bfloat> %b, i32 0
   %neg.b_lo = fneg bfloat %b_lo
   %neg_lo.b = insertelement <2 x bfloat> %b, bfloat %neg.b_lo, i32 0
@@ -532,9 +639,8 @@ define float @v_fdot2_f32_bf16_neg_b_hi_dual(<2 x bfloat> 
%a, <2 x bfloat> %b, f
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_b_hi_dual:
 ; GFX11PLUS:  ; %bb.0:
 ; GFX11PLUS:    v_xor_b16 v1.h, 0x8000, v1.h
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 
v2, v0, v1
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %b_hi = extractelement <2 x bfloat> %b, i32 1
   %neg.b_hi = fneg bfloat %b_hi
   %neg_hi.b = insertelement <2 x bfloat> %b, bfloat %neg.b_hi, i32 1
@@ -547,15 +653,15 @@ define float @v_fdot2_f32_bf16_neg_b_hi_dual(<2 x bfloat> 
%a, <2 x bfloat> %b, f
 define float @v_fdot2_f32_bf16_neg_c_dual(<2 x bfloat> %a, <2 x bfloat> %b, 
float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
 ; GFX950-LABEL: v_fdot2_f32_bf16_neg_c_dual:
 ; GFX950:  ; %bb.0:
-; GFX950:    v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,0,1]
+; GFX950:    v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[0,0,1]
 ; GFX950:    v_dot2c_f32_bf16_e32 v5, v3, v4
-; GFX950:    v_add_f32_e32 v0, v0, v5
+; GFX950:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_c_dual:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,0,1]
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[0,0,1]
+; GFX11PLUS:    v_dot2_f32_bf16 v5, v3, v4, v5
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %neg.c = fneg float %c
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> 
%b, float %neg.c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> 
%e, float %f, i1 false)
@@ -566,15 +672,15 @@ define float @v_fdot2_f32_bf16_neg_c_dual(<2 x bfloat> 
%a, <2 x bfloat> %b, floa
 define float @v_fdot2_f32_bf16_abs_c_dual(<2 x bfloat> %a, <2 x bfloat> %b, 
float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
 ; GFX950-LABEL: v_fdot2_f32_bf16_abs_c_dual:
 ; GFX950:  ; %bb.0:
-; GFX950:    v_dot2_f32_bf16 v0, v0, v1, v2 neg_hi:[0,0,1]
+; GFX950:    v_dot2_f32_bf16 v2, v0, v1, v2 neg_hi:[0,0,1]
 ; GFX950:    v_dot2c_f32_bf16_e32 v5, v3, v4
-; GFX950:    v_add_f32_e32 v0, v0, v5
+; GFX950:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_abs_c_dual:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2 neg_hi:[0,0,1]
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dot2_f32_bf16 v2, v0, v1, v2 neg_hi:[0,0,1]
+; GFX11PLUS:    v_dot2_f32_bf16 v5, v3, v4, v5
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %abs.c = call float @llvm.fabs.f32(float %c)
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> 
%b, float %abs.c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> 
%e, float %f, i1 false)
@@ -594,9 +700,8 @@ define float @v_fdot2_f32_bf16_opsel_lo_a_dual(<2 x bfloat> 
%a, <2 x bfloat> %b,
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_opsel_lo_a_dual:
 ; GFX11PLUS:  ; %bb.0:
 ; GFX11PLUS:    v_mov_b16_e32 v0.l, v0.h
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 
v2, v0, v1
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %shuf = shufflevector <2 x bfloat> %a, <2 x bfloat> poison, <2 x i32> <i32 
1, i32 1>
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %shuf, <2 x 
bfloat> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> 
%e, float %f, i1 false)
@@ -616,9 +721,8 @@ define float @v_fdot2_f32_bf16_opsel_hi_a_dual(<2 x bfloat> 
%a, <2 x bfloat> %b,
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_opsel_hi_a_dual:
 ; GFX11PLUS:  ; %bb.0:
 ; GFX11PLUS:    v_mov_b16_e32 v0.h, v0.l
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 
v2, v0, v1
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %shuf = shufflevector <2 x bfloat> %a, <2 x bfloat> poison, <2 x i32> <i32 
0, i32 0>
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %shuf, <2 x 
bfloat> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> 
%e, float %f, i1 false)
@@ -638,9 +742,8 @@ define float @v_fdot2_f32_bf16_opsel_lo_b_dual(<2 x bfloat> 
%a, <2 x bfloat> %b,
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_opsel_lo_b_dual:
 ; GFX11PLUS:  ; %bb.0:
 ; GFX11PLUS:    v_mov_b16_e32 v1.l, v1.h
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 
v2, v0, v1
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %shuf = shufflevector <2 x bfloat> %b, <2 x bfloat> poison, <2 x i32> <i32 
1, i32 1>
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> 
%shuf, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> 
%e, float %f, i1 false)
@@ -660,9 +763,8 @@ define float @v_fdot2_f32_bf16_opsel_hi_b_dual(<2 x bfloat> 
%a, <2 x bfloat> %b,
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_opsel_hi_b_dual:
 ; GFX11PLUS:  ; %bb.0:
 ; GFX11PLUS:    v_mov_b16_e32 v1.h, v1.l
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 
v2, v0, v1
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %shuf = shufflevector <2 x bfloat> %b, <2 x bfloat> poison, <2 x i32> <i32 
0, i32 0>
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> 
%shuf, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> 
%e, float %f, i1 false)
@@ -682,9 +784,8 @@ define float @v_fdot2_f32_bf16_inline_literal_a_y(<2 x 
bfloat> %a, <2 x bfloat>
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_a_y:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX11PLUS:    v_dot2_f32_bf16 v1, 0x40004000, v4, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dual_dot2acc_f32_bf16 v2, v0, v1 :: v_dual_dot2acc_f32_bf16 
v5, 0x40004000, v4
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> 
%b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> <bfloat 2.0, 
bfloat 2.0>, <2 x bfloat> %e, float %f, i1 false)
   %r = fadd float %r0, %r1
@@ -700,9 +801,8 @@ define float @v_fdot2_f32_bf16_inline_literal_a_xy(<2 x 
bfloat> %a, <2 x bfloat>
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_a_xy:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, 0x40004000, v1, v2
-; GFX11PLUS:    v_dot2_f32_bf16 v1, 0x40004000, v4, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dual_dot2acc_f32_bf16 v2, 0x40004000, v1 :: 
v_dual_dot2acc_f32_bf16 v5, 0x40004000, v4
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> <bfloat 2.0, 
bfloat 2.0>, <2 x bfloat> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> <bfloat 2.0, 
bfloat 2.0>, <2 x bfloat> %e, float %f, i1 false)
   %r = fadd float %r0, %r1
@@ -718,9 +818,8 @@ define float @v_fdot2_f32_bf16_inline_literal_b_x(<2 x 
bfloat> %a, <2 x bfloat>
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b_x:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, 0x40004000, v2
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v4, v3, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dual_dot2acc_f32_bf16 v2, 0x40004000, v0 :: 
v_dual_dot2acc_f32_bf16 v5, v4, v3
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> 
<bfloat 2.0, bfloat 2.0>, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %e, <2 x bfloat> 
%d, float %f, i1 false)
   %r = fadd float %r0, %r1
@@ -736,9 +835,8 @@ define float @v_fdot2_f32_bf16_inline_literal_b_y(<2 x 
bfloat> %a, <2 x bfloat>
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b_y:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v1, v0, v2
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, 0x40004000, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dual_dot2acc_f32_bf16 v2, v1, v0 :: v_dual_dot2acc_f32_bf16 
v5, 0x40004000, v3
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %b, <2 x bfloat> 
%a, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> 
<bfloat 2.0, bfloat 2.0>, float %f, i1 false)
   %r = fadd float %r0, %r1
@@ -754,9 +852,8 @@ define float @v_fdot2_f32_bf16_inline_literal_b_xy(<2 x 
bfloat> %a, <2 x bfloat>
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b_xy:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, 0x40004000, v2
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, 0x40004000, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dual_dot2acc_f32_bf16 v2, 0x40004000, v0 :: 
v_dual_dot2acc_f32_bf16 v5, 0x40004000, v3
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> 
<bfloat 2.0, bfloat 2.0>, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> 
<bfloat 2.0, bfloat 2.0>, float %f, i1 false)
   %r = fadd float %r0, %r1
@@ -774,8 +871,8 @@ define float @v_fdot2_f32_bf16_inline_literal_c_dual(<2 x 
bfloat> %a, <2 x bfloa
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_c_dual:
 ; GFX11PLUS:  ; %bb.0:
 ; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, 2.0
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v2, v3, v4
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dot2_f32_bf16 v4, v2, v3, v4
+; GFX11PLUS:    v_add_f32_e32 v0, v0, v4
   %r0 = tail call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x 
bfloat> %b, float 2.0, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> 
%e, float %f, i1 false)
   %r = fadd float %r0, %r1
@@ -785,9 +882,9 @@ define float @v_fdot2_f32_bf16_inline_literal_c_dual(<2 x 
bfloat> %a, <2 x bfloa
 define float @v_fdot2_f32_bf16_clamp_dual(<2 x bfloat> %a, <2 x bfloat> %b, 
float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
 ; GCN-LABEL: v_fdot2_f32_bf16_clamp_dual:
 ; GCN:  ; %bb.0:
-; GCN:    v_dot2_f32_bf16 v0, v0, v1, v2 clamp
-; GCN:    v_dot2_f32_bf16 v1, v3, v4, v5 clamp
-; GCN:    v_add_f32_e32 v0, v0, v1
+; GCN:    v_dot2_f32_bf16 v2, v0, v1, v2 clamp
+; GCN:    v_dot2_f32_bf16 v5, v3, v4, v5 clamp
+; GCN:    v_add_f32_e32 v0, v2, v5
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> 
%b, float %c, i1 true)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> 
%e, float %f, i1 true)
   %r = fadd float %r0, %r1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
index a16cc091eb766..007757a643535 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
@@ -3,8 +3,8 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck %s 
--check-prefixes=GCN,GFX950
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck %s 
--check-prefixes=GCN,GFX10
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s 
--check-prefixes=GCN,GFX11PLUS,GFX11
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s 
--check-prefixes=GCN,GFX11PLUS,GFX1170
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s 
--check-prefixes=GCN,GFX11PLUS,GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s 
--check-prefixes=GCN,GFX11PLUS,GFX1170-GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck 
%s --check-prefixes=GCN,GFX11PLUS,GFX1170-GFX12
 
 declare float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 
%clamp)
 
@@ -28,13 +28,9 @@ define float @v_fdot2(<2 x half> %a, <2 x half> %b, float 
%c) {
 ; GFX11:    v_dot2acc_f32_f16 v2, v0, v1
 ; GFX11:    v_mov_b32_e32 v0, v2
 ;
-; GFX1170-LABEL: v_fdot2:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-;
-; GFX12-LABEL: v_fdot2:
-; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX1170-GFX12-LABEL: v_fdot2:
+; GFX1170-GFX12:  ; %bb.0:
+; GFX1170-GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
   %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, 
i1 false)
   ret float %r
 }
@@ -71,15 +67,10 @@ define float @v_fdot2_neg_a_lo(<2 x half> %a, <2 x half> 
%b, float %c) {
 ; GFX11:    v_dot2acc_f32_f16 v2, v0, v1
 ; GFX11:    v_mov_b32_e32 v0, v2
 ;
-; GFX1170-LABEL: v_fdot2_neg_a_lo:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_xor_b16 v0.l, 0x8000, v0.l
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-;
-; GFX12-LABEL: v_fdot2_neg_a_lo:
-; GFX12:  ; %bb.0:
-; GFX12:    v_xor_b16 v0.l, 0x8000, v0.l
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX1170-GFX12-LABEL: v_fdot2_neg_a_lo:
+; GFX1170-GFX12:  ; %bb.0:
+; GFX1170-GFX12:    v_xor_b16 v0.l, 0x8000, v0.l
+; GFX1170-GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
   %a_lo = extractelement <2 x half> %a, i32 0
   %neg.a_lo = fneg half %a_lo
   %neg_lo.a = insertelement <2 x half> %a, half %neg.a_lo, i32 0
@@ -111,15 +102,10 @@ define float @v_fdot2_neg_a_hi(<2 x half> %a, <2 x half> 
%b, float %c) {
 ; GFX11:    v_dot2acc_f32_f16 v2, v0, v1
 ; GFX11:    v_mov_b32_e32 v0, v2
 ;
-; GFX1170-LABEL: v_fdot2_neg_a_hi:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_xor_b16 v0.h, 0x8000, v0.h
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-;
-; GFX12-LABEL: v_fdot2_neg_a_hi:
-; GFX12:  ; %bb.0:
-; GFX12:    v_xor_b16 v0.h, 0x8000, v0.h
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX1170-GFX12-LABEL: v_fdot2_neg_a_hi:
+; GFX1170-GFX12:  ; %bb.0:
+; GFX1170-GFX12:    v_xor_b16 v0.h, 0x8000, v0.h
+; GFX1170-GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
   %a_hi = extractelement <2 x half> %a, i32 1
   %neg.a_hi = fneg half %a_hi
   %neg_hi.a = insertelement <2 x half> %a, half %neg.a_hi, i32 1
@@ -159,15 +145,10 @@ define float @v_fdot2_neg_b_lo(<2 x half> %a, <2 x half> 
%b, float %c) {
 ; GFX11:    v_dot2acc_f32_f16 v2, v0, v1
 ; GFX11:    v_mov_b32_e32 v0, v2
 ;
-; GFX1170-LABEL: v_fdot2_neg_b_lo:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_xor_b16 v1.l, 0x8000, v1.l
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-;
-; GFX12-LABEL: v_fdot2_neg_b_lo:
-; GFX12:  ; %bb.0:
-; GFX12:    v_xor_b16 v1.l, 0x8000, v1.l
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX1170-GFX12-LABEL: v_fdot2_neg_b_lo:
+; GFX1170-GFX12:  ; %bb.0:
+; GFX1170-GFX12:    v_xor_b16 v1.l, 0x8000, v1.l
+; GFX1170-GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
   %b_lo = extractelement <2 x half> %b, i32 0
   %neg.b_lo = fneg half %b_lo
   %neg_lo.b = insertelement <2 x half> %b, half %neg.b_lo, i32 0
@@ -199,15 +180,10 @@ define float @v_fdot2_neg_b_hi(<2 x half> %a, <2 x half> 
%b, float %c) {
 ; GFX11:    v_dot2acc_f32_f16 v2, v0, v1
 ; GFX11:    v_mov_b32_e32 v0, v2
 ;
-; GFX1170-LABEL: v_fdot2_neg_b_hi:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_xor_b16 v1.h, 0x8000, v1.h
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-;
-; GFX12-LABEL: v_fdot2_neg_b_hi:
-; GFX12:  ; %bb.0:
-; GFX12:    v_xor_b16 v1.h, 0x8000, v1.h
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX1170-GFX12-LABEL: v_fdot2_neg_b_hi:
+; GFX1170-GFX12:  ; %bb.0:
+; GFX1170-GFX12:    v_xor_b16 v1.h, 0x8000, v1.h
+; GFX1170-GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
   %b_hi = extractelement <2 x half> %b, i32 1
   %neg.b_hi = fneg half %b_hi
   %neg_hi.b = insertelement <2 x half> %b, half %neg.b_hi, i32 1
@@ -255,15 +231,10 @@ define float @v_fdot2_opsel_lo_a(<2 x half> %a, <2 x 
half> %b, float %c) {
 ; GFX11:    v_dot2acc_f32_f16 v2, v0, v1
 ; GFX11:    v_mov_b32_e32 v0, v2
 ;
-; GFX1170-LABEL: v_fdot2_opsel_lo_a:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_mov_b16_e32 v0.l, v0.h
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-;
-; GFX12-LABEL: v_fdot2_opsel_lo_a:
-; GFX12:  ; %bb.0:
-; GFX12:    v_mov_b16_e32 v0.l, v0.h
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX1170-GFX12-LABEL: v_fdot2_opsel_lo_a:
+; GFX1170-GFX12:  ; %bb.0:
+; GFX1170-GFX12:    v_mov_b16_e32 v0.l, v0.h
+; GFX1170-GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
   %shuf = shufflevector <2 x half> %a, <2 x half> poison, <2 x i32> <i32 1, 
i32 1>
   %r = call float @llvm.amdgcn.fdot2(<2 x half> %shuf, <2 x half> %b, float 
%c, i1 false)
   ret float %r
@@ -291,15 +262,10 @@ define float @v_fdot2_opsel_hi_a(<2 x half> %a, <2 x 
half> %b, float %c) {
 ; GFX11:    v_dot2acc_f32_f16 v2, v0, v1
 ; GFX11:    v_mov_b32_e32 v0, v2
 ;
-; GFX1170-LABEL: v_fdot2_opsel_hi_a:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_mov_b16_e32 v0.h, v0.l
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-;
-; GFX12-LABEL: v_fdot2_opsel_hi_a:
-; GFX12:  ; %bb.0:
-; GFX12:    v_mov_b16_e32 v0.h, v0.l
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX1170-GFX12-LABEL: v_fdot2_opsel_hi_a:
+; GFX1170-GFX12:  ; %bb.0:
+; GFX1170-GFX12:    v_mov_b16_e32 v0.h, v0.l
+; GFX1170-GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
   %shuf = shufflevector <2 x half> %a, <2 x half> poison, <2 x i32> <i32 0, 
i32 0>
   %r = call float @llvm.amdgcn.fdot2(<2 x half> %shuf, <2 x half> %b, float 
%c, i1 false)
   ret float %r
@@ -327,15 +293,10 @@ define float @v_fdot2_opsel_lo_b(<2 x half> %a, <2 x 
half> %b, float %c) {
 ; GFX11:    v_dot2acc_f32_f16 v2, v0, v1
 ; GFX11:    v_mov_b32_e32 v0, v2
 ;
-; GFX1170-LABEL: v_fdot2_opsel_lo_b:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_mov_b16_e32 v1.l, v1.h
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-;
-; GFX12-LABEL: v_fdot2_opsel_lo_b:
-; GFX12:  ; %bb.0:
-; GFX12:    v_mov_b16_e32 v1.l, v1.h
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX1170-GFX12-LABEL: v_fdot2_opsel_lo_b:
+; GFX1170-GFX12:  ; %bb.0:
+; GFX1170-GFX12:    v_mov_b16_e32 v1.l, v1.h
+; GFX1170-GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
   %shuf = shufflevector <2 x half> %b, <2 x half> poison, <2 x i32> <i32 1, 
i32 1>
   %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %shuf, float 
%c, i1 false)
   ret float %r
@@ -363,15 +324,10 @@ define float @v_fdot2_opsel_hi_b(<2 x half> %a, <2 x 
half> %b, float %c) {
 ; GFX11:    v_dot2acc_f32_f16 v2, v0, v1
 ; GFX11:    v_mov_b32_e32 v0, v2
 ;
-; GFX1170-LABEL: v_fdot2_opsel_hi_b:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_mov_b16_e32 v1.h, v1.l
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-;
-; GFX12-LABEL: v_fdot2_opsel_hi_b:
-; GFX12:  ; %bb.0:
-; GFX12:    v_mov_b16_e32 v1.h, v1.l
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX1170-GFX12-LABEL: v_fdot2_opsel_hi_b:
+; GFX1170-GFX12:  ; %bb.0:
+; GFX1170-GFX12:    v_mov_b16_e32 v1.h, v1.l
+; GFX1170-GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
   %shuf = shufflevector <2 x half> %b, <2 x half> poison, <2 x i32> <i32 0, 
i32 0>
   %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %shuf, float 
%c, i1 false)
   ret float %r
@@ -397,13 +353,9 @@ define float @v_fdot2_inline_literal_a(<2 x half> %b, 
float %c) {
 ; GFX11:    v_dot2acc_f32_f16 v1, 0x40004000, v0
 ; GFX11:    v_mov_b32_e32 v0, v1
 ;
-; GFX1170-LABEL: v_fdot2_inline_literal_a:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, 0x40004000, v0, v1
-;
-; GFX12-LABEL: v_fdot2_inline_literal_a:
-; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, 0x40004000, v0, v1
+; GFX1170-GFX12-LABEL: v_fdot2_inline_literal_a:
+; GFX1170-GFX12:  ; %bb.0:
+; GFX1170-GFX12:    v_dot2_f32_f16 v0, 0x40004000, v0, v1
   %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> <half 2.0, half 2.0>, 
<2 x half> %b, float %c, i1 false)
   ret float %ret
 }
@@ -428,13 +380,9 @@ define float @v_fdot2_inline_literal_b(<2 x half> %a, 
float %c) {
 ; GFX11:    v_dot2acc_f32_f16 v1, 0x40004000, v0
 ; GFX11:    v_mov_b32_e32 v0, v1
 ;
-; GFX1170-LABEL: v_fdot2_inline_literal_b:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, v0, 0x40004000, v1
-;
-; GFX12-LABEL: v_fdot2_inline_literal_b:
-; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, v0, 0x40004000, v1
+; GFX1170-GFX12-LABEL: v_fdot2_inline_literal_b:
+; GFX1170-GFX12:  ; %bb.0:
+; GFX1170-GFX12:    v_dot2_f32_f16 v0, 0x40004000, v0, v1
   %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> <half 
2.0, half 2.0>, float %c, i1 false)
   ret float %ret
 }
@@ -462,13 +410,9 @@ define float @v_fdot2_inline_literal_c(<2 x half> %a, <2 x 
half> %b) {
 ; GFX11:    v_dot2acc_f32_f16 v2, v0, v1
 ; GFX11:    v_mov_b32_e32 v0, v2
 ;
-; GFX1170-LABEL: v_fdot2_inline_literal_c:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, 2.0
-;
-; GFX12-LABEL: v_fdot2_inline_literal_c:
-; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, 2.0
+; GFX1170-GFX12-LABEL: v_fdot2_inline_literal_c:
+; GFX1170-GFX12:  ; %bb.0:
+; GFX1170-GFX12:    v_dot2_f32_f16 v0, v0, v1, 2.0
   %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, 
float 2.0, i1 false)
   ret float %ret
 }
@@ -650,7 +594,7 @@ define float @v_fdot2_inline_literal_b_clamp(<2 x half> %a, 
float %c) {
 ;
 ; GFX11PLUS-LABEL: v_fdot2_inline_literal_b_clamp:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_f16 v0, v0, 0x40004000, v1 clamp
+; GFX11PLUS:    v_dot2_f32_f16 v0, 0x40004000, v0, v1 clamp
   %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> <half 
2.0, half 2.0>, float %c, i1 true)
   ret float %ret
 }
@@ -666,9 +610,9 @@ define float @v_fdot2_inline_literal_c_clamp(<2 x half> %a, 
<2 x half> %b) {
 define float @v_fdot2_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> 
%d, <2 x half> %e, float %f) {
 ; GFX906-LABEL: v_fdot2_dual:
 ; GFX906:  ; %bb.0:
-; GFX906:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX906:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX906:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX906:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX950-LABEL: v_fdot2_dual:
 ; GFX950:  ; %bb.0:
@@ -682,22 +626,214 @@ define float @v_fdot2_dual(<2 x half> %a, <2 x half> %b, 
float %c, <2 x half> %d
 ; GFX10:    v_dot2c_f32_f16 v5, v3, v4
 ; GFX10:    v_add_f32_e32 v0, v2, v5
 ;
-; GFX11-LABEL: v_fdot2_dual:
+; GFX11PLUS-LABEL: v_fdot2_dual:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_dual_dot2acc_f32_f16 v2, v0, v1 :: v_dual_dot2acc_f32_f16 
v5, v3, v4
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
+  %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, 
i1 false)
+  %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, 
i1 false)
+  %r = fadd float %r0, %r1
+  ret float %r
+}
+
+define float @v_fdot2_dual_sgpr_src0_x(<2 x half> inreg %a, <2 x half> %b, 
float %c, <2 x half> %d, <2 x half> %e, float %f) {
+; GFX906-LABEL: v_fdot2_dual_sgpr_src0_x:
+; GFX906:  ; %bb.0:
+; GFX906:    v_dot2_f32_f16 v1, s16, v0, v1
+; GFX906:    v_dot2_f32_f16 v4, v2, v3, v4
+; GFX906:    v_add_f32_e32 v0, v1, v4
+;
+; GFX950-LABEL: v_fdot2_dual_sgpr_src0_x:
+; GFX950:  ; %bb.0:
+; GFX950:    v_dot2c_f32_f16_e32 v1, s0, v0
+; GFX950:    v_dot2c_f32_f16_e32 v4, v2, v3
+; GFX950:    v_add_f32_e32 v0, v1, v4
+;
+; GFX10-LABEL: v_fdot2_dual_sgpr_src0_x:
+; GFX10:  ; %bb.0:
+; GFX10:    v_dot2c_f32_f16 v1, s16, v0
+; GFX10:    v_dot2c_f32_f16 v4, v2, v3
+; GFX10:    v_add_f32_e32 v0, v1, v4
+;
+; GFX11PLUS-LABEL: v_fdot2_dual_sgpr_src0_x:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_dual_dot2acc_f32_f16 v1, s0, v0 :: v_dual_dot2acc_f32_f16 
v4, v2, v3
+; GFX11PLUS:    v_add_f32_e32 v0, v1, v4
+  %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, 
i1 false)
+  %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, 
i1 false)
+  %r = fadd float %r0, %r1
+  ret float %r
+}
+
+define float @v_fdot2_dual_sgpr_src1_x(<2 x half> %a, <2 x half> inreg %b, 
float %c, <2 x half> %d, <2 x half> %e, float %f) {
+; GFX906-LABEL: v_fdot2_dual_sgpr_src1_x:
+; GFX906:  ; %bb.0:
+; GFX906:    v_dot2_f32_f16 v1, v0, s16, v1
+; GFX906:    v_dot2_f32_f16 v4, v2, v3, v4
+; GFX906:    v_add_f32_e32 v0, v1, v4
+;
+; GFX950-LABEL: v_fdot2_dual_sgpr_src1_x:
+; GFX950:  ; %bb.0:
+; GFX950:    v_dot2c_f32_f16_e32 v1, s0, v0
+; GFX950:    v_dot2c_f32_f16_e32 v4, v2, v3
+; GFX950:    v_add_f32_e32 v0, v1, v4
+;
+; GFX10-LABEL: v_fdot2_dual_sgpr_src1_x:
+; GFX10:  ; %bb.0:
+; GFX10:    v_dot2c_f32_f16 v1, s16, v0
+; GFX10:    v_dot2c_f32_f16 v4, v2, v3
+; GFX10:    v_add_f32_e32 v0, v1, v4
+;
+; GFX11-LABEL: v_fdot2_dual_sgpr_src1_x:
+; GFX11:  ; %bb.0:
+; GFX11:    v_dual_dot2acc_f32_f16 v1, s0, v0 :: v_dual_dot2acc_f32_f16 v4, 
v2, v3
+; GFX11:    v_add_f32_e32 v0, v1, v4
+;
+; GFX1170-GFX12-LABEL: v_fdot2_dual_sgpr_src1_x:
+; GFX1170-GFX12:  ; %bb.0:
+; GFX1170-GFX12:    v_dot2_f32_f16 v1, v0, s0, v1
+; GFX1170-GFX12:    v_dot2_f32_f16 v4, v2, v3, v4
+; GFX1170-GFX12:    v_add_f32_e32 v0, v1, v4
+  %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, 
i1 false)
+  %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, 
i1 false)
+  %r = fadd float %r0, %r1
+  ret float %r
+}
+
+define float @v_fdot2_dual_sgpr_src2_x(<2 x half> %a, <2 x half> %b, float 
inreg %c, <2 x half> %d, <2 x half> %e, float %f) {
+; GFX906-LABEL: v_fdot2_dual_sgpr_src2_x:
+; GFX906:  ; %bb.0:
+; GFX906:    v_dot2_f32_f16 v0, v0, v1, s16
+; GFX906:    v_dot2_f32_f16 v4, v2, v3, v4
+; GFX906:    v_add_f32_e32 v0, v0, v4
+;
+; GFX950-LABEL: v_fdot2_dual_sgpr_src2_x:
+; GFX950:  ; %bb.0:
+; GFX950:    v_mov_b32_e32 v5, s0
+; GFX950:    v_dot2c_f32_f16_e32 v5, v0, v1
+; GFX950:    v_dot2c_f32_f16_e32 v4, v2, v3
+; GFX950:    v_add_f32_e32 v0, v5, v4
+;
+; GFX10-LABEL: v_fdot2_dual_sgpr_src2_x:
+; GFX10:  ; %bb.0:
+; GFX10:    v_mov_b32_e32 v5, s16
+; GFX10:    v_dot2c_f32_f16 v4, v2, v3
+; GFX10:    v_dot2c_f32_f16 v5, v0, v1
+; GFX10:    v_add_f32_e32 v0, v5, v4
+;
+; GFX11-LABEL: v_fdot2_dual_sgpr_src2_x:
+; GFX11:  ; %bb.0:
+; GFX11:    v_dual_mov_b32 v5, s0 :: v_dual_dot2acc_f32_f16 v4, v2, v3
+; GFX11:    v_dot2acc_f32_f16 v5, v0, v1
+; GFX11:    v_add_f32_e32 v0, v5, v4
+;
+; GFX1170-GFX12-LABEL: v_fdot2_dual_sgpr_src2_x:
+; GFX1170-GFX12:  ; %bb.0:
+; GFX1170-GFX12:    v_dot2_f32_f16 v0, v0, v1, s0
+; GFX1170-GFX12:    v_dot2_f32_f16 v4, v2, v3, v4
+; GFX1170-GFX12:    v_add_f32_e32 v0, v0, v4
+  %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, 
i1 false)
+  %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, 
i1 false)
+  %r = fadd float %r0, %r1
+  ret float %r
+}
+
+define float @v_fdot2_dual_sgpr_src0_y(<2 x half> %a, <2 x half> %b, float %c, 
<2 x half> inreg %d, <2 x half> %e, <2 x half> %vopd_dst_pad, float %f) {
+; GFX906-LABEL: v_fdot2_dual_sgpr_src0_y:
+; GFX906:  ; %bb.0:
+; GFX906:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX906:    v_dot2_f32_f16 v5, s16, v3, v5
+; GFX906:    v_add_f32_e32 v0, v2, v5
+;
+; GFX950-LABEL: v_fdot2_dual_sgpr_src0_y:
+; GFX950:  ; %bb.0:
+; GFX950:    v_dot2c_f32_f16_e32 v2, v0, v1
+; GFX950:    v_dot2c_f32_f16_e32 v5, s0, v3
+; GFX950:    v_add_f32_e32 v0, v2, v5
+;
+; GFX10-LABEL: v_fdot2_dual_sgpr_src0_y:
+; GFX10:  ; %bb.0:
+; GFX10:    v_dot2c_f32_f16 v2, v0, v1
+; GFX10:    v_dot2c_f32_f16 v5, s16, v3
+; GFX10:    v_add_f32_e32 v0, v2, v5
+;
+; GFX11PLUS-LABEL: v_fdot2_dual_sgpr_src0_y:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_dual_dot2acc_f32_f16 v2, v0, v1 :: v_dual_dot2acc_f32_f16 
v5, s0, v3
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
+  %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, 
i1 false)
+  %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, 
i1 false)
+  %r = fadd float %r0, %r1
+  ret float %r
+}
+
+define float @v_fdot2_dual_sgpr_src1_y(<2 x half> %a, <2 x half> %b, float %c, 
<2 x half> %d, <2 x half> %vopd_dst_pad, <2 x half> inreg %e, float %f) {
+; GFX906-LABEL: v_fdot2_dual_sgpr_src1_y:
+; GFX906:  ; %bb.0:
+; GFX906:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX906:    v_dot2_f32_f16 v5, v3, s16, v5
+; GFX906:    v_add_f32_e32 v0, v2, v5
+;
+; GFX950-LABEL: v_fdot2_dual_sgpr_src1_y:
+; GFX950:  ; %bb.0:
+; GFX950:    v_dot2c_f32_f16_e32 v2, v0, v1
+; GFX950:    v_dot2c_f32_f16_e32 v5, s0, v3
+; GFX950:    v_add_f32_e32 v0, v2, v5
+;
+; GFX10-LABEL: v_fdot2_dual_sgpr_src1_y:
+; GFX10:  ; %bb.0:
+; GFX10:    v_dot2c_f32_f16 v2, v0, v1
+; GFX10:    v_dot2c_f32_f16 v5, s16, v3
+; GFX10:    v_add_f32_e32 v0, v2, v5
+;
+; GFX11-LABEL: v_fdot2_dual_sgpr_src1_y:
+; GFX11:  ; %bb.0:
+; GFX11:    v_dual_dot2acc_f32_f16 v2, v0, v1 :: v_dual_dot2acc_f32_f16 v5, 
s0, v3
+; GFX11:    v_add_f32_e32 v0, v2, v5
+;
+; GFX1170-GFX12-LABEL: v_fdot2_dual_sgpr_src1_y:
+; GFX1170-GFX12:  ; %bb.0:
+; GFX1170-GFX12:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX1170-GFX12:    v_dot2_f32_f16 v5, v3, s0, v5
+; GFX1170-GFX12:    v_add_f32_e32 v0, v2, v5
+  %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, 
i1 false)
+  %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, 
i1 false)
+  %r = fadd float %r0, %r1
+  ret float %r
+}
+
+define float @v_fdot2_dual_sgpr_src2_y(<2 x half> %a, <2 x half> %b, float %c, 
<2 x half> %d, <2 x half> %e, float inreg %f) {
+; GFX906-LABEL: v_fdot2_dual_sgpr_src2_y:
+; GFX906:  ; %bb.0:
+; GFX906:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX906:    v_dot2_f32_f16 v0, v3, v4, s16
+; GFX906:    v_add_f32_e32 v0, v2, v0
+;
+; GFX950-LABEL: v_fdot2_dual_sgpr_src2_y:
+; GFX950:  ; %bb.0:
+; GFX950:    v_dot2c_f32_f16_e32 v2, v0, v1
+; GFX950:    v_mov_b32_e32 v0, s0
+; GFX950:    v_dot2c_f32_f16_e32 v0, v3, v4
+; GFX950:    v_add_f32_e32 v0, v2, v0
+;
+; GFX10-LABEL: v_fdot2_dual_sgpr_src2_y:
+; GFX10:  ; %bb.0:
+; GFX10:    v_mov_b32_e32 v5, s16
+; GFX10:    v_dot2c_f32_f16 v2, v0, v1
+; GFX10:    v_dot2c_f32_f16 v5, v3, v4
+; GFX10:    v_add_f32_e32 v0, v2, v5
+;
+; GFX11-LABEL: v_fdot2_dual_sgpr_src2_y:
 ; GFX11:  ; %bb.0:
-; GFX11:    v_dual_dot2acc_f32_f16 v2, v0, v1 :: v_dual_dot2acc_f32_f16 v5, 
v3, v4
+; GFX11:    v_dual_mov_b32 v5, s0 :: v_dual_dot2acc_f32_f16 v2, v0, v1
+; GFX11:    v_dot2acc_f32_f16 v5, v3, v4
 ; GFX11:    v_add_f32_e32 v0, v2, v5
 ;
-; GFX1170-LABEL: v_fdot2_dual:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_dual:
-; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX1170-GFX12-LABEL: v_fdot2_dual_sgpr_src2_y:
+; GFX1170-GFX12:  ; %bb.0:
+; GFX1170-GFX12:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX1170-GFX12:    v_dot2_f32_f16 v0, v3, v4, s0
+; GFX1170-GFX12:    v_add_f32_e32 v0, v2, v0
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, 
i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, 
i1 false)
   %r = fadd float %r0, %r1
@@ -707,39 +843,33 @@ define float @v_fdot2_dual(<2 x half> %a, <2 x half> %b, 
float %c, <2 x half> %d
 define float @v_fdot2_neg_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x 
half> %d, <2 x half> %e, float %f) {
 ; GFX906-LABEL: v_fdot2_neg_a_dual:
 ; GFX906:  ; %bb.0:
-; GFX906:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX906:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX906:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX906:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX950-LABEL: v_fdot2_neg_a_dual:
 ; GFX950:  ; %bb.0:
-; GFX950:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX950:    v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX950:    v_dot2c_f32_f16_e32 v5, v3, v4
-; GFX950:    v_add_f32_e32 v0, v0, v5
+; GFX950:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX10-LABEL: v_fdot2_neg_a_dual:
 ; GFX10:  ; %bb.0:
-; GFX10:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX10:    v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX10:    v_dot2c_f32_f16 v5, v3, v4
-; GFX10:    v_add_f32_e32 v0, v0, v5
+; GFX10:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX11-LABEL: v_fdot2_neg_a_dual:
 ; GFX11:  ; %bb.0:
-; GFX11:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX11:    v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX11:    v_dot2acc_f32_f16 v5, v3, v4
-; GFX11:    v_add_f32_e32 v0, v0, v5
-;
-; GFX1170-LABEL: v_fdot2_neg_a_dual:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_neg_a_dual:
-; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX11:    v_add_f32_e32 v0, v2, v5
+;
+; GFX1170-GFX12-LABEL: v_fdot2_neg_a_dual:
+; GFX1170-GFX12:  ; %bb.0:
+; GFX1170-GFX12:    v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX1170-GFX12:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX1170-GFX12:    v_add_f32_e32 v0, v2, v5
   %neg.a = fneg <2 x half> %a
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %neg.a, <2 x half> %b, float 
%c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, 
i1 false)
@@ -750,9 +880,9 @@ define float @v_fdot2_neg_a_dual(<2 x half> %a, <2 x half> 
%b, float %c, <2 x ha
 define float @v_fdot2_neg_a_lo_dual(<2 x half> %a, <2 x half> %b, float %c, <2 
x half> %d, <2 x half> %e, float %f) {
 ; GFX906-LABEL: v_fdot2_neg_a_lo_dual:
 ; GFX906:  ; %bb.0:
-; GFX906:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0]
-; GFX906:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[1,0,0]
+; GFX906:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX906:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX950-LABEL: v_fdot2_neg_a_lo_dual:
 ; GFX950:  ; %bb.0:
@@ -765,29 +895,15 @@ define float @v_fdot2_neg_a_lo_dual(<2 x half> %a, <2 x 
half> %b, float %c, <2 x
 ;
 ; GFX10-LABEL: v_fdot2_neg_a_lo_dual:
 ; GFX10:  ; %bb.0:
-; GFX10:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0]
+; GFX10:    v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[1,0,0]
 ; GFX10:    v_dot2c_f32_f16 v5, v3, v4
-; GFX10:    v_add_f32_e32 v0, v0, v5
-;
-; GFX11-LABEL: v_fdot2_neg_a_lo_dual:
-; GFX11:  ; %bb.0:
-; GFX11:    v_xor_b16 v0.l, 0x8000, v0.l
-; GFX11:    v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, 
v0, v1
-; GFX11:    v_add_f32_e32 v0, v2, v5
+; GFX10:    v_add_f32_e32 v0, v2, v5
 ;
-; GFX1170-LABEL: v_fdot2_neg_a_lo_dual:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_xor_b16 v0.l, 0x8000, v0.l
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_neg_a_lo_dual:
-; GFX12:  ; %bb.0:
-; GFX12:    v_xor_b16 v0.l, 0x8000, v0.l
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS-LABEL: v_fdot2_neg_a_lo_dual:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11PLUS:    v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 
v2, v0, v1
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %a_lo = extractelement <2 x half> %a, i32 0
   %neg.a_lo = fneg half %a_lo
   %neg_lo.a = insertelement <2 x half> %a, half %neg.a_lo, i32 0
@@ -800,9 +916,9 @@ define float @v_fdot2_neg_a_lo_dual(<2 x half> %a, <2 x 
half> %b, float %c, <2 x
 define float @v_fdot2_neg_a_hi_dual(<2 x half> %a, <2 x half> %b, float %c, <2 
x half> %d, <2 x half> %e, float %f) {
 ; GFX906-LABEL: v_fdot2_neg_a_hi_dual:
 ; GFX906:  ; %bb.0:
-; GFX906:    v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[1,0,0]
-; GFX906:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v2, v0, v1, v2 neg_hi:[1,0,0]
+; GFX906:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX906:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX950-LABEL: v_fdot2_neg_a_hi_dual:
 ; GFX950:  ; %bb.0:
@@ -816,29 +932,15 @@ define float @v_fdot2_neg_a_hi_dual(<2 x half> %a, <2 x 
half> %b, float %c, <2 x
 ;
 ; GFX10-LABEL: v_fdot2_neg_a_hi_dual:
 ; GFX10:  ; %bb.0:
-; GFX10:    v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[1,0,0]
+; GFX10:    v_dot2_f32_f16 v2, v0, v1, v2 neg_hi:[1,0,0]
 ; GFX10:    v_dot2c_f32_f16 v5, v3, v4
-; GFX10:    v_add_f32_e32 v0, v0, v5
-;
-; GFX11-LABEL: v_fdot2_neg_a_hi_dual:
-; GFX11:  ; %bb.0:
-; GFX11:    v_xor_b16 v0.h, 0x8000, v0.h
-; GFX11:    v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, 
v0, v1
-; GFX11:    v_add_f32_e32 v0, v2, v5
+; GFX10:    v_add_f32_e32 v0, v2, v5
 ;
-; GFX1170-LABEL: v_fdot2_neg_a_hi_dual:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_xor_b16 v0.h, 0x8000, v0.h
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_neg_a_hi_dual:
-; GFX12:  ; %bb.0:
-; GFX12:    v_xor_b16 v0.h, 0x8000, v0.h
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS-LABEL: v_fdot2_neg_a_hi_dual:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_xor_b16 v0.h, 0x8000, v0.h
+; GFX11PLUS:    v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 
v2, v0, v1
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %a_hi = extractelement <2 x half> %a, i32 1
   %neg.a_hi = fneg half %a_hi
   %neg_hi.a = insertelement <2 x half> %a, half %neg.a_hi, i32 1
@@ -851,39 +953,33 @@ define float @v_fdot2_neg_a_hi_dual(<2 x half> %a, <2 x 
half> %b, float %c, <2 x
 define float @v_fdot2_neg_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x 
half> %d, <2 x half> %e, float %f) {
 ; GFX906-LABEL: v_fdot2_neg_b_dual:
 ; GFX906:  ; %bb.0:
-; GFX906:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX906:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX906:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX906:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX950-LABEL: v_fdot2_neg_b_dual:
 ; GFX950:  ; %bb.0:
-; GFX950:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX950:    v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX950:    v_dot2c_f32_f16_e32 v5, v3, v4
-; GFX950:    v_add_f32_e32 v0, v0, v5
+; GFX950:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX10-LABEL: v_fdot2_neg_b_dual:
 ; GFX10:  ; %bb.0:
-; GFX10:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX10:    v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX10:    v_dot2c_f32_f16 v5, v3, v4
-; GFX10:    v_add_f32_e32 v0, v0, v5
+; GFX10:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX11-LABEL: v_fdot2_neg_b_dual:
 ; GFX11:  ; %bb.0:
-; GFX11:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX11:    v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX11:    v_dot2acc_f32_f16 v5, v3, v4
-; GFX11:    v_add_f32_e32 v0, v0, v5
-;
-; GFX1170-LABEL: v_fdot2_neg_b_dual:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_neg_b_dual:
-; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX11:    v_add_f32_e32 v0, v2, v5
+;
+; GFX1170-GFX12-LABEL: v_fdot2_neg_b_dual:
+; GFX1170-GFX12:  ; %bb.0:
+; GFX1170-GFX12:    v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX1170-GFX12:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX1170-GFX12:    v_add_f32_e32 v0, v2, v5
   %neg.b = fneg <2 x half> %b
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %neg.b, float 
%c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, 
i1 false)
@@ -894,9 +990,9 @@ define float @v_fdot2_neg_b_dual(<2 x half> %a, <2 x half> 
%b, float %c, <2 x ha
 define float @v_fdot2_neg_b_lo_dual(<2 x half> %a, <2 x half> %b, float %c, <2 
x half> %d, <2 x half> %e, float %f) {
 ; GFX906-LABEL: v_fdot2_neg_b_lo_dual:
 ; GFX906:  ; %bb.0:
-; GFX906:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0]
-; GFX906:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,1,0]
+; GFX906:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX906:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX950-LABEL: v_fdot2_neg_b_lo_dual:
 ; GFX950:  ; %bb.0:
@@ -909,29 +1005,15 @@ define float @v_fdot2_neg_b_lo_dual(<2 x half> %a, <2 x 
half> %b, float %c, <2 x
 ;
 ; GFX10-LABEL: v_fdot2_neg_b_lo_dual:
 ; GFX10:  ; %bb.0:
-; GFX10:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0]
+; GFX10:    v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,1,0]
 ; GFX10:    v_dot2c_f32_f16 v5, v3, v4
-; GFX10:    v_add_f32_e32 v0, v0, v5
-;
-; GFX11-LABEL: v_fdot2_neg_b_lo_dual:
-; GFX11:  ; %bb.0:
-; GFX11:    v_xor_b16 v1.l, 0x8000, v1.l
-; GFX11:    v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, 
v0, v1
-; GFX11:    v_add_f32_e32 v0, v2, v5
+; GFX10:    v_add_f32_e32 v0, v2, v5
 ;
-; GFX1170-LABEL: v_fdot2_neg_b_lo_dual:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_xor_b16 v1.l, 0x8000, v1.l
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_neg_b_lo_dual:
-; GFX12:  ; %bb.0:
-; GFX12:    v_xor_b16 v1.l, 0x8000, v1.l
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS-LABEL: v_fdot2_neg_b_lo_dual:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_xor_b16 v1.l, 0x8000, v1.l
+; GFX11PLUS:    v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 
v2, v0, v1
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %b_lo = extractelement <2 x half> %b, i32 0
   %neg.b_lo = fneg half %b_lo
   %neg_lo.b = insertelement <2 x half> %b, half %neg.b_lo, i32 0
@@ -944,9 +1026,9 @@ define float @v_fdot2_neg_b_lo_dual(<2 x half> %a, <2 x 
half> %b, float %c, <2 x
 define float @v_fdot2_neg_b_hi_dual(<2 x half> %a, <2 x half> %b, float %c, <2 
x half> %d, <2 x half> %e, float %f) {
 ; GFX906-LABEL: v_fdot2_neg_b_hi_dual:
 ; GFX906:  ; %bb.0:
-; GFX906:    v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,1,0]
-; GFX906:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v2, v0, v1, v2 neg_hi:[0,1,0]
+; GFX906:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX906:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX950-LABEL: v_fdot2_neg_b_hi_dual:
 ; GFX950:  ; %bb.0:
@@ -960,29 +1042,15 @@ define float @v_fdot2_neg_b_hi_dual(<2 x half> %a, <2 x 
half> %b, float %c, <2 x
 ;
 ; GFX10-LABEL: v_fdot2_neg_b_hi_dual:
 ; GFX10:  ; %bb.0:
-; GFX10:    v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,1,0]
+; GFX10:    v_dot2_f32_f16 v2, v0, v1, v2 neg_hi:[0,1,0]
 ; GFX10:    v_dot2c_f32_f16 v5, v3, v4
-; GFX10:    v_add_f32_e32 v0, v0, v5
-;
-; GFX11-LABEL: v_fdot2_neg_b_hi_dual:
-; GFX11:  ; %bb.0:
-; GFX11:    v_xor_b16 v1.h, 0x8000, v1.h
-; GFX11:    v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, 
v0, v1
-; GFX11:    v_add_f32_e32 v0, v2, v5
+; GFX10:    v_add_f32_e32 v0, v2, v5
 ;
-; GFX1170-LABEL: v_fdot2_neg_b_hi_dual:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_xor_b16 v1.h, 0x8000, v1.h
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_neg_b_hi_dual:
-; GFX12:  ; %bb.0:
-; GFX12:    v_xor_b16 v1.h, 0x8000, v1.h
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS-LABEL: v_fdot2_neg_b_hi_dual:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_xor_b16 v1.h, 0x8000, v1.h
+; GFX11PLUS:    v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 
v2, v0, v1
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %b_hi = extractelement <2 x half> %b, i32 1
   %neg.b_hi = fneg half %b_hi
   %neg_hi.b = insertelement <2 x half> %b, half %neg.b_hi, i32 1
@@ -995,39 +1063,33 @@ define float @v_fdot2_neg_b_hi_dual(<2 x half> %a, <2 x 
half> %b, float %c, <2 x
 define float @v_fdot2_neg_c_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x 
half> %d, <2 x half> %e, float %f) {
 ; GFX906-LABEL: v_fdot2_neg_c_dual:
 ; GFX906:  ; %bb.0:
-; GFX906:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1]
-; GFX906:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,0,1]
+; GFX906:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX906:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX950-LABEL: v_fdot2_neg_c_dual:
 ; GFX950:  ; %bb.0:
-; GFX950:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1]
+; GFX950:    v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,0,1]
 ; GFX950:    v_dot2c_f32_f16_e32 v5, v3, v4
-; GFX950:    v_add_f32_e32 v0, v0, v5
+; GFX950:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX10-LABEL: v_fdot2_neg_c_dual:
 ; GFX10:  ; %bb.0:
-; GFX10:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1]
+; GFX10:    v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,0,1]
 ; GFX10:    v_dot2c_f32_f16 v5, v3, v4
-; GFX10:    v_add_f32_e32 v0, v0, v5
+; GFX10:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX11-LABEL: v_fdot2_neg_c_dual:
 ; GFX11:  ; %bb.0:
-; GFX11:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1]
+; GFX11:    v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,0,1]
 ; GFX11:    v_dot2acc_f32_f16 v5, v3, v4
-; GFX11:    v_add_f32_e32 v0, v0, v5
-;
-; GFX1170-LABEL: v_fdot2_neg_c_dual:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1]
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_neg_c_dual:
-; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1]
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX11:    v_add_f32_e32 v0, v2, v5
+;
+; GFX1170-GFX12-LABEL: v_fdot2_neg_c_dual:
+; GFX1170-GFX12:  ; %bb.0:
+; GFX1170-GFX12:    v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,0,1]
+; GFX1170-GFX12:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX1170-GFX12:    v_add_f32_e32 v0, v2, v5
   %neg.c = fneg float %c
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float 
%neg.c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, 
i1 false)
@@ -1038,39 +1100,33 @@ define float @v_fdot2_neg_c_dual(<2 x half> %a, <2 x 
half> %b, float %c, <2 x ha
 define float @v_fdot2_abs_c_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x 
half> %d, <2 x half> %e, float %f) {
 ; GFX906-LABEL: v_fdot2_abs_c_dual:
 ; GFX906:  ; %bb.0:
-; GFX906:    v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1]
-; GFX906:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v2, v0, v1, v2 neg_hi:[0,0,1]
+; GFX906:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX906:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX950-LABEL: v_fdot2_abs_c_dual:
 ; GFX950:  ; %bb.0:
-; GFX950:    v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1]
+; GFX950:    v_dot2_f32_f16 v2, v0, v1, v2 neg_hi:[0,0,1]
 ; GFX950:    v_dot2c_f32_f16_e32 v5, v3, v4
-; GFX950:    v_add_f32_e32 v0, v0, v5
+; GFX950:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX10-LABEL: v_fdot2_abs_c_dual:
 ; GFX10:  ; %bb.0:
-; GFX10:    v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1]
+; GFX10:    v_dot2_f32_f16 v2, v0, v1, v2 neg_hi:[0,0,1]
 ; GFX10:    v_dot2c_f32_f16 v5, v3, v4
-; GFX10:    v_add_f32_e32 v0, v0, v5
+; GFX10:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX11-LABEL: v_fdot2_abs_c_dual:
 ; GFX11:  ; %bb.0:
-; GFX11:    v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1]
+; GFX11:    v_dot2_f32_f16 v2, v0, v1, v2 neg_hi:[0,0,1]
 ; GFX11:    v_dot2acc_f32_f16 v5, v3, v4
-; GFX11:    v_add_f32_e32 v0, v0, v5
-;
-; GFX1170-LABEL: v_fdot2_abs_c_dual:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1]
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_abs_c_dual:
-; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1]
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX11:    v_add_f32_e32 v0, v2, v5
+;
+; GFX1170-GFX12-LABEL: v_fdot2_abs_c_dual:
+; GFX1170-GFX12:  ; %bb.0:
+; GFX1170-GFX12:    v_dot2_f32_f16 v2, v0, v1, v2 neg_hi:[0,0,1]
+; GFX1170-GFX12:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX1170-GFX12:    v_add_f32_e32 v0, v2, v5
   %abs.c = call float @llvm.fabs.f32(float %c)
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float 
%abs.c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, 
i1 false)
@@ -1081,9 +1137,9 @@ define float @v_fdot2_abs_c_dual(<2 x half> %a, <2 x 
half> %b, float %c, <2 x ha
 define float @v_fdot2_opsel_lo_a_dual(<2 x half> %a, <2 x half> %b, float %c, 
<2 x half> %d, <2 x half> %e, float %f) {
 ; GFX906-LABEL: v_fdot2_opsel_lo_a_dual:
 ; GFX906:  ; %bb.0:
-; GFX906:    v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[1,0,0]
-; GFX906:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v2, v0, v1, v2 op_sel:[1,0,0]
+; GFX906:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX906:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX950-LABEL: v_fdot2_opsel_lo_a_dual:
 ; GFX950:  ; %bb.0:
@@ -1095,29 +1151,15 @@ define float @v_fdot2_opsel_lo_a_dual(<2 x half> %a, <2 
x half> %b, float %c, <2
 ;
 ; GFX10-LABEL: v_fdot2_opsel_lo_a_dual:
 ; GFX10:  ; %bb.0:
-; GFX10:    v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[1,0,0]
+; GFX10:    v_dot2_f32_f16 v2, v0, v1, v2 op_sel:[1,0,0]
 ; GFX10:    v_dot2c_f32_f16 v5, v3, v4
-; GFX10:    v_add_f32_e32 v0, v0, v5
-;
-; GFX11-LABEL: v_fdot2_opsel_lo_a_dual:
-; GFX11:  ; %bb.0:
-; GFX11:    v_mov_b16_e32 v0.l, v0.h
-; GFX11:    v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, 
v0, v1
-; GFX11:    v_add_f32_e32 v0, v2, v5
+; GFX10:    v_add_f32_e32 v0, v2, v5
 ;
-; GFX1170-LABEL: v_fdot2_opsel_lo_a_dual:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_mov_b16_e32 v0.l, v0.h
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_opsel_lo_a_dual:
-; GFX12:  ; %bb.0:
-; GFX12:    v_mov_b16_e32 v0.l, v0.h
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS-LABEL: v_fdot2_opsel_lo_a_dual:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_mov_b16_e32 v0.l, v0.h
+; GFX11PLUS:    v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 
v2, v0, v1
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %shuf = shufflevector <2 x half> %a, <2 x half> poison, <2 x i32> <i32 1, 
i32 1>
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %shuf, <2 x half> %b, float 
%c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, 
i1 false)
@@ -1128,9 +1170,9 @@ define float @v_fdot2_opsel_lo_a_dual(<2 x half> %a, <2 x 
half> %b, float %c, <2
 define float @v_fdot2_opsel_hi_a_dual(<2 x half> %a, <2 x half> %b, float %c, 
<2 x half> %d, <2 x half> %e, float %f) {
 ; GFX906-LABEL: v_fdot2_opsel_hi_a_dual:
 ; GFX906:  ; %bb.0:
-; GFX906:    v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[0,1,1]
-; GFX906:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v2, v0, v1, v2 op_sel_hi:[0,1,1]
+; GFX906:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX906:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX950-LABEL: v_fdot2_opsel_hi_a_dual:
 ; GFX950:  ; %bb.0:
@@ -1142,29 +1184,15 @@ define float @v_fdot2_opsel_hi_a_dual(<2 x half> %a, <2 
x half> %b, float %c, <2
 ;
 ; GFX10-LABEL: v_fdot2_opsel_hi_a_dual:
 ; GFX10:  ; %bb.0:
-; GFX10:    v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[0,1,1]
+; GFX10:    v_dot2_f32_f16 v2, v0, v1, v2 op_sel_hi:[0,1,1]
 ; GFX10:    v_dot2c_f32_f16 v5, v3, v4
-; GFX10:    v_add_f32_e32 v0, v0, v5
-;
-; GFX11-LABEL: v_fdot2_opsel_hi_a_dual:
-; GFX11:  ; %bb.0:
-; GFX11:    v_mov_b16_e32 v0.h, v0.l
-; GFX11:    v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, 
v0, v1
-; GFX11:    v_add_f32_e32 v0, v2, v5
+; GFX10:    v_add_f32_e32 v0, v2, v5
 ;
-; GFX1170-LABEL: v_fdot2_opsel_hi_a_dual:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_mov_b16_e32 v0.h, v0.l
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_opsel_hi_a_dual:
-; GFX12:  ; %bb.0:
-; GFX12:    v_mov_b16_e32 v0.h, v0.l
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS-LABEL: v_fdot2_opsel_hi_a_dual:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_mov_b16_e32 v0.h, v0.l
+; GFX11PLUS:    v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 
v2, v0, v1
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %shuf = shufflevector <2 x half> %a, <2 x half> poison, <2 x i32> <i32 0, 
i32 0>
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %shuf, <2 x half> %b, float 
%c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, 
i1 false)
@@ -1175,9 +1203,9 @@ define float @v_fdot2_opsel_hi_a_dual(<2 x half> %a, <2 x 
half> %b, float %c, <2
 define float @v_fdot2_opsel_lo_b_dual(<2 x half> %a, <2 x half> %b, float %c, 
<2 x half> %d, <2 x half> %e, float %f) {
 ; GFX906-LABEL: v_fdot2_opsel_lo_b_dual:
 ; GFX906:  ; %bb.0:
-; GFX906:    v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[0,1,0]
-; GFX906:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v2, v0, v1, v2 op_sel:[0,1,0]
+; GFX906:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX906:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX950-LABEL: v_fdot2_opsel_lo_b_dual:
 ; GFX950:  ; %bb.0:
@@ -1189,29 +1217,15 @@ define float @v_fdot2_opsel_lo_b_dual(<2 x half> %a, <2 
x half> %b, float %c, <2
 ;
 ; GFX10-LABEL: v_fdot2_opsel_lo_b_dual:
 ; GFX10:  ; %bb.0:
-; GFX10:    v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[0,1,0]
+; GFX10:    v_dot2_f32_f16 v2, v0, v1, v2 op_sel:[0,1,0]
 ; GFX10:    v_dot2c_f32_f16 v5, v3, v4
-; GFX10:    v_add_f32_e32 v0, v0, v5
-;
-; GFX11-LABEL: v_fdot2_opsel_lo_b_dual:
-; GFX11:  ; %bb.0:
-; GFX11:    v_mov_b16_e32 v1.l, v1.h
-; GFX11:    v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, 
v0, v1
-; GFX11:    v_add_f32_e32 v0, v2, v5
+; GFX10:    v_add_f32_e32 v0, v2, v5
 ;
-; GFX1170-LABEL: v_fdot2_opsel_lo_b_dual:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_mov_b16_e32 v1.l, v1.h
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_opsel_lo_b_dual:
-; GFX12:  ; %bb.0:
-; GFX12:    v_mov_b16_e32 v1.l, v1.h
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS-LABEL: v_fdot2_opsel_lo_b_dual:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_mov_b16_e32 v1.l, v1.h
+; GFX11PLUS:    v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 
v2, v0, v1
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %shuf = shufflevector <2 x half> %b, <2 x half> poison, <2 x i32> <i32 1, 
i32 1>
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %shuf, float 
%c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, 
i1 false)
@@ -1222,9 +1236,9 @@ define float @v_fdot2_opsel_lo_b_dual(<2 x half> %a, <2 x 
half> %b, float %c, <2
 define float @v_fdot2_opsel_hi_b_dual(<2 x half> %a, <2 x half> %b, float %c, 
<2 x half> %d, <2 x half> %e, float %f) {
 ; GFX906-LABEL: v_fdot2_opsel_hi_b_dual:
 ; GFX906:  ; %bb.0:
-; GFX906:    v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[1,0,1]
-; GFX906:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v2, v0, v1, v2 op_sel_hi:[1,0,1]
+; GFX906:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX906:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX950-LABEL: v_fdot2_opsel_hi_b_dual:
 ; GFX950:  ; %bb.0:
@@ -1236,29 +1250,15 @@ define float @v_fdot2_opsel_hi_b_dual(<2 x half> %a, <2 
x half> %b, float %c, <2
 ;
 ; GFX10-LABEL: v_fdot2_opsel_hi_b_dual:
 ; GFX10:  ; %bb.0:
-; GFX10:    v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[1,0,1]
+; GFX10:    v_dot2_f32_f16 v2, v0, v1, v2 op_sel_hi:[1,0,1]
 ; GFX10:    v_dot2c_f32_f16 v5, v3, v4
-; GFX10:    v_add_f32_e32 v0, v0, v5
-;
-; GFX11-LABEL: v_fdot2_opsel_hi_b_dual:
-; GFX11:  ; %bb.0:
-; GFX11:    v_mov_b16_e32 v1.h, v1.l
-; GFX11:    v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, 
v0, v1
-; GFX11:    v_add_f32_e32 v0, v2, v5
+; GFX10:    v_add_f32_e32 v0, v2, v5
 ;
-; GFX1170-LABEL: v_fdot2_opsel_hi_b_dual:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_mov_b16_e32 v1.h, v1.l
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_opsel_hi_b_dual:
-; GFX12:  ; %bb.0:
-; GFX12:    v_mov_b16_e32 v1.h, v1.l
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS-LABEL: v_fdot2_opsel_hi_b_dual:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_mov_b16_e32 v1.h, v1.l
+; GFX11PLUS:    v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 
v2, v0, v1
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %shuf = shufflevector <2 x half> %b, <2 x half> poison, <2 x i32> <i32 0, 
i32 0>
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %shuf, float 
%c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, 
i1 false)
@@ -1272,9 +1272,9 @@ define float @v_fdot2_opsel_hi_b_dual(<2 x half> %a, <2 x 
half> %b, float %c, <2
 define float @v_fdot2_inline_literal_a_x(<2 x half> %a, <2 x half> %b, float 
%c, <2 x half> %d, <2 x half> %e, float %f) {
 ; GFX906-LABEL: v_fdot2_inline_literal_a_x:
 ; GFX906:  ; %bb.0:
-; GFX906:    v_dot2_f32_f16 v0, 2.0, v1, v2 op_sel_hi:[0,1,1]
-; GFX906:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v2, 2.0, v1, v2 op_sel_hi:[0,1,1]
+; GFX906:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX906:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX950-LABEL: v_fdot2_inline_literal_a_x:
 ; GFX950:  ; %bb.0:
@@ -1288,22 +1288,10 @@ define float @v_fdot2_inline_literal_a_x(<2 x half> %a, 
<2 x half> %b, float %c,
 ; GFX10:    v_dot2c_f32_f16 v5, v3, v4
 ; GFX10:    v_add_f32_e32 v0, v2, v5
 ;
-; GFX11-LABEL: v_fdot2_inline_literal_a_x:
-; GFX11:  ; %bb.0:
-; GFX11:    v_dual_dot2acc_f32_f16 v2, 0x40004000, v1 :: 
v_dual_dot2acc_f32_f16 v5, v3, v4
-; GFX11:    v_add_f32_e32 v0, v2, v5
-;
-; GFX1170-LABEL: v_fdot2_inline_literal_a_x:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, 0x40004000, v1, v2
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_inline_literal_a_x:
-; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, 0x40004000, v1, v2
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS-LABEL: v_fdot2_inline_literal_a_x:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_dual_dot2acc_f32_f16 v2, 0x40004000, v1 :: 
v_dual_dot2acc_f32_f16 v5, v3, v4
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> <half 2.0, half 2.0>, <2 x 
half> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, 
i1 false)
   %r = fadd float %r0, %r1
@@ -1313,9 +1301,9 @@ define float @v_fdot2_inline_literal_a_x(<2 x half> %a, 
<2 x half> %b, float %c,
 define float @v_fdot2_inline_literal_a_y(<2 x half> %a, <2 x half> %b, float 
%c, <2 x half> %d, <2 x half> %e, float %f) {
 ; GFX906-LABEL: v_fdot2_inline_literal_a_y:
 ; GFX906:  ; %bb.0:
-; GFX906:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX906:    v_dot2_f32_f16 v1, 2.0, v4, v5 op_sel_hi:[0,1,1]
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX906:    v_dot2_f32_f16 v5, 2.0, v4, v5 op_sel_hi:[0,1,1]
+; GFX906:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX950-LABEL: v_fdot2_inline_literal_a_y:
 ; GFX950:  ; %bb.0:
@@ -1329,22 +1317,10 @@ define float @v_fdot2_inline_literal_a_y(<2 x half> %a, 
<2 x half> %b, float %c,
 ; GFX10:    v_dot2c_f32_f16 v5, 0x40004000, v4
 ; GFX10:    v_add_f32_e32 v0, v2, v5
 ;
-; GFX11-LABEL: v_fdot2_inline_literal_a_y:
-; GFX11:  ; %bb.0:
-; GFX11:    v_dual_dot2acc_f32_f16 v2, v0, v1 :: v_dual_dot2acc_f32_f16 v5, 
0x40004000, v4
-; GFX11:    v_add_f32_e32 v0, v2, v5
-;
-; GFX1170-LABEL: v_fdot2_inline_literal_a_y:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX1170:    v_dot2_f32_f16 v1, 0x40004000, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_inline_literal_a_y:
-; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_f16 v1, 0x40004000, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS-LABEL: v_fdot2_inline_literal_a_y:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_dual_dot2acc_f32_f16 v2, v0, v1 :: v_dual_dot2acc_f32_f16 
v5, 0x40004000, v4
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, 
i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> <half 2.0, half 2.0>, <2 x 
half> %e, float %f, i1 false)
   %r = fadd float %r0, %r1
@@ -1354,9 +1330,9 @@ define float @v_fdot2_inline_literal_a_y(<2 x half> %a, 
<2 x half> %b, float %c,
 define float @v_fdot2_inline_literal_a_xy(<2 x half> %a, <2 x half> %b, float 
%c, <2 x half> %d, <2 x half> %e, float %f) {
 ; GFX906-LABEL: v_fdot2_inline_literal_a_xy:
 ; GFX906:  ; %bb.0:
-; GFX906:    v_dot2_f32_f16 v0, 2.0, v1, v2 op_sel_hi:[0,1,1]
-; GFX906:    v_dot2_f32_f16 v1, 2.0, v4, v5 op_sel_hi:[0,1,1]
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v2, 2.0, v1, v2 op_sel_hi:[0,1,1]
+; GFX906:    v_dot2_f32_f16 v5, 2.0, v4, v5 op_sel_hi:[0,1,1]
+; GFX906:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX950-LABEL: v_fdot2_inline_literal_a_xy:
 ; GFX950:  ; %bb.0:
@@ -1370,22 +1346,10 @@ define float @v_fdot2_inline_literal_a_xy(<2 x half> 
%a, <2 x half> %b, float %c
 ; GFX10:    v_dot2c_f32_f16 v5, 0x40004000, v4
 ; GFX10:    v_add_f32_e32 v0, v2, v5
 ;
-; GFX11-LABEL: v_fdot2_inline_literal_a_xy:
-; GFX11:  ; %bb.0:
-; GFX11:    v_dual_dot2acc_f32_f16 v2, 0x40004000, v1 :: 
v_dual_dot2acc_f32_f16 v5, 0x40004000, v4
-; GFX11:    v_add_f32_e32 v0, v2, v5
-;
-; GFX1170-LABEL: v_fdot2_inline_literal_a_xy:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, 0x40004000, v1, v2
-; GFX1170:    v_dot2_f32_f16 v1, 0x40004000, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_inline_literal_a_xy:
-; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, 0x40004000, v1, v2
-; GFX12:    v_dot2_f32_f16 v1, 0x40004000, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS-LABEL: v_fdot2_inline_literal_a_xy:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_dual_dot2acc_f32_f16 v2, 0x40004000, v1 :: 
v_dual_dot2acc_f32_f16 v5, 0x40004000, v4
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> <half 2.0, half 2.0>, <2 x 
half> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> <half 2.0, half 2.0>, <2 x 
half> %e, float %f, i1 false)
   %r = fadd float %r0, %r1
@@ -1395,9 +1359,9 @@ define float @v_fdot2_inline_literal_a_xy(<2 x half> %a, 
<2 x half> %b, float %c
 define float @v_fdot2_inline_literal_b_x(<2 x half> %a, <2 x half> %b, float 
%c, <2 x half> %d, <2 x half> %e, float %f) {
 ; GFX906-LABEL: v_fdot2_inline_literal_b_x:
 ; GFX906:  ; %bb.0:
-; GFX906:    v_dot2_f32_f16 v0, v0, 2.0, v2 op_sel_hi:[1,0,1]
-; GFX906:    v_dot2_f32_f16 v1, v4, v3, v5
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v2, v0, 2.0, v2 op_sel_hi:[1,0,1]
+; GFX906:    v_dot2_f32_f16 v5, v4, v3, v5
+; GFX906:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX950-LABEL: v_fdot2_inline_literal_b_x:
 ; GFX950:  ; %bb.0:
@@ -1411,22 +1375,10 @@ define float @v_fdot2_inline_literal_b_x(<2 x half> %a, 
<2 x half> %b, float %c,
 ; GFX10:    v_dot2c_f32_f16 v5, v4, v3
 ; GFX10:    v_add_f32_e32 v0, v2, v5
 ;
-; GFX11-LABEL: v_fdot2_inline_literal_b_x:
-; GFX11:  ; %bb.0:
-; GFX11:    v_dual_dot2acc_f32_f16 v2, 0x40004000, v0 :: 
v_dual_dot2acc_f32_f16 v5, v4, v3
-; GFX11:    v_add_f32_e32 v0, v2, v5
-;
-; GFX1170-LABEL: v_fdot2_inline_literal_b_x:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, v0, 0x40004000, v2
-; GFX1170:    v_dot2_f32_f16 v1, v4, v3, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_inline_literal_b_x:
-; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, v0, 0x40004000, v2
-; GFX12:    v_dot2_f32_f16 v1, v4, v3, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS-LABEL: v_fdot2_inline_literal_b_x:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_dual_dot2acc_f32_f16 v2, 0x40004000, v0 :: 
v_dual_dot2acc_f32_f16 v5, v4, v3
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> <half 2.0, 
half 2.0>, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %e, <2 x half> %d, float %f, 
i1 false)
   %r = fadd float %r0, %r1
@@ -1436,9 +1388,9 @@ define float @v_fdot2_inline_literal_b_x(<2 x half> %a, 
<2 x half> %b, float %c,
 define float @v_fdot2_inline_literal_b_y(<2 x half> %a, <2 x half> %b, float 
%c, <2 x half> %d, <2 x half> %e, float %f) {
 ; GFX906-LABEL: v_fdot2_inline_literal_b_y:
 ; GFX906:  ; %bb.0:
-; GFX906:    v_dot2_f32_f16 v0, v1, v0, v2
-; GFX906:    v_dot2_f32_f16 v1, v3, 2.0, v5 op_sel_hi:[1,0,1]
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v2, v1, v0, v2
+; GFX906:    v_dot2_f32_f16 v5, v3, 2.0, v5 op_sel_hi:[1,0,1]
+; GFX906:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX950-LABEL: v_fdot2_inline_literal_b_y:
 ; GFX950:  ; %bb.0:
@@ -1452,22 +1404,10 @@ define float @v_fdot2_inline_literal_b_y(<2 x half> %a, 
<2 x half> %b, float %c,
 ; GFX10:    v_dot2c_f32_f16 v5, 0x40004000, v3
 ; GFX10:    v_add_f32_e32 v0, v2, v5
 ;
-; GFX11-LABEL: v_fdot2_inline_literal_b_y:
-; GFX11:  ; %bb.0:
-; GFX11:    v_dual_dot2acc_f32_f16 v2, v1, v0 :: v_dual_dot2acc_f32_f16 v5, 
0x40004000, v3
-; GFX11:    v_add_f32_e32 v0, v2, v5
-;
-; GFX1170-LABEL: v_fdot2_inline_literal_b_y:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, v1, v0, v2
-; GFX1170:    v_dot2_f32_f16 v1, v3, 0x40004000, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_inline_literal_b_y:
-; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, v1, v0, v2
-; GFX12:    v_dot2_f32_f16 v1, v3, 0x40004000, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS-LABEL: v_fdot2_inline_literal_b_y:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_dual_dot2acc_f32_f16 v2, v1, v0 :: v_dual_dot2acc_f32_f16 
v5, 0x40004000, v3
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %b, <2 x half> %a, float %c, 
i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> <half 2.0, 
half 2.0>, float %f, i1 false)
   %r = fadd float %r0, %r1
@@ -1477,9 +1417,9 @@ define float @v_fdot2_inline_literal_b_y(<2 x half> %a, 
<2 x half> %b, float %c,
 define float @v_fdot2_inline_literal_b_xy(<2 x half> %a, <2 x half> %b, float 
%c, <2 x half> %d, <2 x half> %e, float %f) {
 ; GFX906-LABEL: v_fdot2_inline_literal_b_xy:
 ; GFX906:  ; %bb.0:
-; GFX906:    v_dot2_f32_f16 v0, v0, 2.0, v2 op_sel_hi:[1,0,1]
-; GFX906:    v_dot2_f32_f16 v1, v3, 2.0, v5 op_sel_hi:[1,0,1]
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v2, v0, 2.0, v2 op_sel_hi:[1,0,1]
+; GFX906:    v_dot2_f32_f16 v5, v3, 2.0, v5 op_sel_hi:[1,0,1]
+; GFX906:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX950-LABEL: v_fdot2_inline_literal_b_xy:
 ; GFX950:  ; %bb.0:
@@ -1493,22 +1433,10 @@ define float @v_fdot2_inline_literal_b_xy(<2 x half> 
%a, <2 x half> %b, float %c
 ; GFX10:    v_dot2c_f32_f16 v5, 0x40004000, v3
 ; GFX10:    v_add_f32_e32 v0, v2, v5
 ;
-; GFX11-LABEL: v_fdot2_inline_literal_b_xy:
-; GFX11:  ; %bb.0:
-; GFX11:    v_dual_dot2acc_f32_f16 v2, 0x40004000, v0 :: 
v_dual_dot2acc_f32_f16 v5, 0x40004000, v3
-; GFX11:    v_add_f32_e32 v0, v2, v5
-;
-; GFX1170-LABEL: v_fdot2_inline_literal_b_xy:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, v0, 0x40004000, v2
-; GFX1170:    v_dot2_f32_f16 v1, v3, 0x40004000, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_inline_literal_b_xy:
-; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, v0, 0x40004000, v2
-; GFX12:    v_dot2_f32_f16 v1, v3, 0x40004000, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS-LABEL: v_fdot2_inline_literal_b_xy:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_dual_dot2acc_f32_f16 v2, 0x40004000, v0 :: 
v_dual_dot2acc_f32_f16 v5, 0x40004000, v3
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> <half 2.0, 
half 2.0>, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> <half 2.0, 
half 2.0>, float %f, i1 false)
   %r = fadd float %r0, %r1
@@ -1519,8 +1447,8 @@ define float @v_fdot2_inline_literal_c_dual(<2 x half> 
%a, <2 x half> %b, <2 x h
 ; GFX906-LABEL: v_fdot2_inline_literal_c_dual:
 ; GFX906:  ; %bb.0:
 ; GFX906:    v_dot2_f32_f16 v0, v0, v1, 2.0
-; GFX906:    v_dot2_f32_f16 v1, v2, v3, v4
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v4, v2, v3, v4
+; GFX906:    v_add_f32_e32 v0, v0, v4
 ;
 ; GFX950-LABEL: v_fdot2_inline_literal_c_dual:
 ; GFX950:  ; %bb.0:
@@ -1542,17 +1470,11 @@ define float @v_fdot2_inline_literal_c_dual(<2 x half> 
%a, <2 x half> %b, <2 x h
 ; GFX11:    v_dot2acc_f32_f16 v5, v0, v1
 ; GFX11:    v_add_f32_e32 v0, v5, v4
 ;
-; GFX1170-LABEL: v_fdot2_inline_literal_c_dual:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, 2.0
-; GFX1170:    v_dot2_f32_f16 v1, v2, v3, v4
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_inline_literal_c_dual:
-; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, 2.0
-; GFX12:    v_dot2_f32_f16 v1, v2, v3, v4
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX1170-GFX12-LABEL: v_fdot2_inline_literal_c_dual:
+; GFX1170-GFX12:  ; %bb.0:
+; GFX1170-GFX12:    v_dot2_f32_f16 v0, v0, v1, 2.0
+; GFX1170-GFX12:    v_dot2_f32_f16 v4, v2, v3, v4
+; GFX1170-GFX12:    v_add_f32_e32 v0, v0, v4
   %r0 = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float 
2.0, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, 
i1 false)
   %r = fadd float %r0, %r1
@@ -1562,9 +1484,9 @@ define float @v_fdot2_inline_literal_c_dual(<2 x half> 
%a, <2 x half> %b, <2 x h
 define float @v_fdot2_clamp_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x 
half> %d, <2 x half> %e, float %f) {
 ; GCN-LABEL: v_fdot2_clamp_dual:
 ; GCN:  ; %bb.0:
-; GCN:    v_dot2_f32_f16 v0, v0, v1, v2 clamp
-; GCN:    v_dot2_f32_f16 v1, v3, v4, v5 clamp
-; GCN:    v_add_f32_e32 v0, v0, v1
+; GCN:    v_dot2_f32_f16 v2, v0, v1, v2 clamp
+; GCN:    v_dot2_f32_f16 v5, v3, v4, v5 clamp
+; GCN:    v_add_f32_e32 v0, v2, v5
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, 
i1 true)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, 
i1 true)
   %r = fadd float %r0, %r1

_______________________________________________
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

Reply via email to