[llvm-branch-commits] [llvm] AMDGPU: Codegen for v_dual_dot2acc_f32_f16/bf16 from VOP3 (PR #179226)

Petar Avramovic via llvm-branch-commits Mon, 02 Feb 2026 05:02:21 -0800

https://github.com/petar-avramovic created 
https://github.com/llvm/llvm-project/pull/179226


Codegen for v_dual_dot2acc_f32_f16/bf16 for targets that only have the VOP3
version of the instruction.
Since there is no VOP2 version, introduce a temporary MIR DOT2ACC pseudo
that is selected when there are no src_modifiers. This DOT2ACC pseudo
has src2 tied to dst (like the VOP2 version); PostRA pseudo expansion will
restore the pseudo to the VOP3 version of the instruction.
CreateVOPD will recognize such a VOP3 pseudo and generate v_dual_dot2acc.

>From 0070a4ae320b5e90f2544d4a7dd5399a24e335a2 Mon Sep 17 00:00:00 2001
From: Petar Avramovic <[email protected]>
Date: Mon, 2 Feb 2026 13:25:21 +0100
Subject: [PATCH] AMDGPU: Codegen for v_dual_dot2acc_f32_f16/bf16 from VOP3

Codegen for v_dual_dot2acc_f32_f16/bf16 for targets that only have the VOP3
version of the instruction.
Since there is no VOP2 version, introduce a temporary MIR DOT2ACC pseudo
that is selected when there are no src_modifiers. This DOT2ACC pseudo
has src2 tied to dst (like the VOP2 version); PostRA pseudo expansion will
restore the pseudo to the VOP3 version of the instruction.
CreateVOPD will recognize such a VOP3 pseudo and generate v_dual_dot2acc.
---
 llvm/lib/Target/AMDGPU/AMDGPU.td              |   3 +
 llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp      |   5 +-
 llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp       |   2 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |   8 ++
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          |  13 ++
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |   8 +-
 llvm/lib/Target/AMDGPU/VOP3PInstructions.td   |  39 +++++-
 llvm/lib/Target/AMDGPU/VOPInstructions.td     |   4 +-
 .../AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll      | 118 +++++++++---------
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll |  95 +++++++-------
 10 files changed, 182 insertions(+), 113 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 1a9bdb6634629..d006509b6aa6d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -740,6 +740,9 @@ defm Dot13Insts : AMDGPUSubtargetFeature<"dot13-insts",
   "Has v_dot2c_f32_bf16 instructions"
 >;
 
+def HasOnlyDualDot2AccF32F16 : Predicate<"Subtarget->hasVOPDInsts() && 
Subtarget->hasDot10Insts() && !Subtarget->hasDot5Insts()">;
+def HasOnlyDualDot2AccF32BF16 : Predicate<"Subtarget->hasVOPDInsts() && 
Subtarget->hasDot12Insts() && !Subtarget->hasDot13Insts()">;
+
 defm MAIInsts : AMDGPUSubtargetFeature<"mai-insts",
   "Has mAI instructions"
 >;
diff --git a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp 
b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp
index 72805aa9165b6..0118c2436d7a4 100644
--- a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp
@@ -94,14 +94,15 @@ class GCNCreateVOPD {
     for (auto CompIdx : VOPD::COMPONENTS) {
       auto CompSrcOprNum = InstInfo[CompIdx].getCompSrcOperandsNum();
       bool IsVOP3 = SII->isVOP3(*MI[CompIdx]);
+      bool IsVOP3Dot = IsVOP3 && SII->isDOT(*MI[CompIdx]);
       for (unsigned CompSrcIdx = 0; CompSrcIdx < CompSrcOprNum; ++CompSrcIdx) {
         if (AMDGPU::hasNamedOperand(VOPDOpc, Mods[CompIdx][CompSrcIdx])) {
           const MachineOperand *Mod =
               SII->getNamedOperand(*MI[CompIdx], SrcMods[CompSrcIdx]);
           VOPDInst.addImm(Mod ? Mod->getImm() : 0);
         }
-        auto MCOprIdx =
-            InstInfo[CompIdx].getIndexOfSrcInMCOperands(CompSrcIdx, IsVOP3);
+        auto MCOprIdx = InstInfo[CompIdx].getIndexOfSrcInMCOperands(
+            CompSrcIdx, IsVOP3, IsVOP3Dot);
         VOPDInst.add(MI[CompIdx]->getOperand(MCOprIdx));
       }
       if (MI[CompIdx]->getOpcode() == AMDGPU::V_CNDMASK_B32_e32 && CI.IsVOPD3)
diff --git a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp 
b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
index 663f53889ac74..4300d5a3a8dd2 100644
--- a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
@@ -44,7 +44,7 @@ bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII,
 
   if (IsVOPD3 && !ST.hasVOPD3())
     return false;
-  if (!IsVOPD3 && (TII.isVOP3(MIX) || TII.isVOP3(MIY)))
+  if (!IsVOPD3 && (TII.isVOP3WithoutVOPD(MIX) || TII.isVOP3WithoutVOPD(MIY)))
     return false;
   if (TII.isDPP(MIX) || TII.isDPP(MIY))
     return false;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 09efba485f6f8..684a0368fb292 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2065,6 +2065,14 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) 
const {
   const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
   switch (MI.getOpcode()) {
   default: return TargetInstrInfo::expandPostRAPseudo(MI);
+  case AMDGPU::V_DOT2ACC_F32_F16_PSEUDO:
+    MI.setDesc(get(AMDGPU::V_DOT2_F32_F16));
+    break;
+
+  case AMDGPU::V_DOT2ACC_F32_BF16_PSEUDO:
+    MI.setDesc(get(AMDGPU::V_DOT2_F32_BF16));
+    break;
+
   case AMDGPU::S_MOV_B64_term:
     // This is only a terminator to get the correct spill code placement during
     // register allocation.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 05cf804d08ffc..da0678644d787 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -559,6 +559,19 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
 
   static bool isVOP3(const MachineInstr &MI) { return isVOP3(MI.getDesc()); }
 
+  static bool isVOP3WithoutVOPD(const MachineInstr &MI) {
+    if (MI.getOpcode() == AMDGPU::V_DOT2_F32_F16 ||
+        MI.getOpcode() == AMDGPU::V_DOT2_F32_BF16) {
+      // VOPD if no src_mods, no clamp, no inline const and src2 same as dst.
+      return MI.getOperand(1).getImm() != 8 || !MI.getOperand(2).isReg() ||
+             MI.getOperand(3).getImm() != 8 || !MI.getOperand(4).isReg() ||
+             MI.getOperand(5).getImm() != 8 || !MI.getOperand(6).isReg() ||
+             MI.getOperand(6).getReg() != MI.getOperand(0).getReg() ||
+             MI.getOperand(7).getImm() != 0;
+    }
+    return isVOP3(MI.getDesc());
+  }
+
   bool isVOP3(uint16_t Opcode) const { return isVOP3(get(Opcode)); }
 
   static bool isSDWA(const MachineInstr &MI) {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h 
b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 0ecec79d08a38..9d2b1502d91e7 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -843,12 +843,18 @@ class ComponentLayout {
   unsigned getIndexOfDstInMCOperands() const { return MC_DST_IDX[Kind]; }
 
   // Return the index of the specified src operand in MCInst operands.
-  unsigned getIndexOfSrcInMCOperands(unsigned CompSrcIdx, bool VOPD3) const {
+  unsigned getIndexOfSrcInMCOperands(unsigned CompSrcIdx, bool VOPD3,
+                                     bool VOP3Dot = false) const {
     assert(CompSrcIdx < Component::MAX_SRC_NUM);
 
     if (Kind == SINGLE && CompSrcIdx == 2 && BitOp3Idx != -1)
       return BitOp3Idx;
 
+    if (VOP3Dot) {
+      return SINGLE_MC_SRC_IDX[3][CompSrcIdx] + getPrevCompSrcNum() +
+             (Kind != SINGLE ? 1 : 0);
+    }
+
     if (VOPD3) {
       return SINGLE_MC_SRC_IDX[VOPD3ModsNum][CompSrcIdx] + getPrevCompSrcNum() 
+
              getPrevCompVOPD3ModsNum() + (Kind != SINGLE ? 1 : 0);
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td 
b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 6554b6588ca2a..c773bd7273409 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -87,11 +87,13 @@ multiclass VOP3PInst<string OpName, VOPProfile P,
 }
 
 multiclass VOP3PInstDotWithDual<string OpName, VOPProfile P,
-                                SDPatternOperator node = null_frag> {
+                                SDPatternOperator node = null_frag,
+                                bits<6> VOPDOp, string VOPDName> {
   def NAME : VOP3P_Pseudo<OpName, P,
                           getVOP3PModPat<P, node,
                                          1 /*HasExplicitClamp*/, 1/*IsDOT*/,
-                                         VOP3PModsDOT, VOP3PModsF32>.ret>;
+                                         VOP3PModsDOT, VOP3PModsF32>.ret>,
+             VOPD_Component<VOPDOp, VOPDName>;
   let SubtargetPredicate = isGFX11Plus in {
   if P.HasExtVOP3DPP then
     def _dpp : VOP3_DPP_Pseudo<OpName, P> {
@@ -584,7 +586,7 @@ defm V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16",
 let OtherPredicates = [HasDot10Insts] in
 defm V_DOT2_F32_F16 : VOP3PInstDotWithDual<"v_dot2_f32_f16",
   VOP3P_Profile<VOP_F32_V2F16_V2F16_F32, VOP3_REGULAR, /*HasDPP*/ 1>,
-  AMDGPUfdot2>;
+  AMDGPUfdot2, 0xC, "v_dot2acc_f32_f16">;
 
 let OtherPredicates = [HasDot7Insts] in {
 defm V_DOT4_U32_U8  : VOP3PInst<"v_dot4_u32_u8",
@@ -608,12 +610,41 @@ def DOT2_BF16_Profile
 let SubtargetPredicate = HasDot12Insts  in {
 
 defm V_DOT2_F32_BF16 : VOP3PInstDotWithDual<"v_dot2_f32_bf16", 
DOT2_BF16_Profile,
-  int_amdgcn_fdot2_f32_bf16>;
+  int_amdgcn_fdot2_f32_bf16, 0xD, "v_dot2acc_f32_bf16">;
 
 } // End SubtargetPredicate = HasDot12Insts
 
 } // End let IsDOT = 1
 
+let IsDOT = 1, OtherPredicates = [HasOnlyDualDot2AccF32F16] in
+def V_DOT2ACC_F32_F16_PSEUDO : VOP3P_Pseudo<"", 
VOP3P_Profile<VOP_F32_V2F16_V2F16_F32, VOP3_REGULAR>> {
+  let Constraints = "$vdst = $src2";
+}
+
+let SubtargetPredicate = HasOnlyDualDot2AccF32F16 in
+def : GCNPat<
+  (f32 (AMDGPUfdot2 (v2f16 (VOP3PNoModsDOT v2f16:$src0)),
+                    (v2f16 (VOP3PNoModsDOT v2f16:$src1)),
+                    (f32 (VOP3PNoModsF32 f32:$src2)),
+                    (i1 DSTCLAMP.NONE))),
+  (f32 (V_DOT2ACC_F32_F16_PSEUDO (i32 8), $src0, (i32 8), $src1, (i32 8), 
$src2))
+>;
+
+let IsDOT = 1, OtherPredicates = [HasOnlyDualDot2AccF32BF16] in
+def V_DOT2ACC_F32_BF16_PSEUDO : VOP3P_Pseudo<"", DOT2_BF16_Profile> {
+  let Constraints = "$vdst = $src2";
+}
+
+let SubtargetPredicate = HasOnlyDualDot2AccF32BF16 in
+def : GCNPat<
+  (f32 (int_amdgcn_fdot2_f32_bf16 (v2bf16 (VOP3PNoModsDOT v2bf16:$src0)),
+                                  (v2bf16 (VOP3PNoModsDOT v2bf16:$src1)),
+                                  (f32 (VOP3PNoModsF32 f32:$src2)),
+                                  (i1 DSTCLAMP.NONE))),
+  (f32 (V_DOT2ACC_F32_BF16_PSEUDO (i32 8), $src0, (i32 8), $src1, (i32 8), 
$src2))
+>;
+
+
 multiclass VOP3PDOTIUInst <string OpName, SDPatternOperator intrinsic_node> {
   let IsDOT = 1 in
   defm NAME : VOP3PInst<OpName, VOP3P_Profile<VOP_I32_I32_I32_I32, 
VOP3_PACKED_NO_OPSEL>,
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td 
b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index c097e4088549d..a4d1416775f4d 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -34,8 +34,8 @@ class VOP <string opName> {
   string OpName = opName;
 }
 
-// First 13 insts from VOPDY are also VOPDX. DOT2ACC_F32_BF16 is omitted
-defvar VOPDX_Max_Index = 12;
+// First 14 insts (indices 0..13) from VOPDY are also VOPDX.
+defvar VOPDX_Max_Index = 13;
 defvar VOPD3X_Max_Index = 36;
 
 class VOPD_Component<bits<6> OpIn, string vOPDName> {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
index 72d5d102cefea..01792c404d19d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
@@ -13,7 +13,8 @@ define float @v_fdot2_f32_bf16(<2 x bfloat> %a, <2 x bfloat> 
%b, float %c) {
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11PLUS:    v_dot2_f32_bf16 v2, v0, v1, v2
+; GFX11PLUS:    v_mov_b32_e32 v0, v2
   %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> 
%b, float %c, i1 false)
   ret float %r
 }
@@ -77,12 +78,14 @@ define float @v_fdot2_f32_bf16_opsel_lo_a(<2 x bfloat> %a, 
<2 x bfloat> %b, floa
 ; GFX11-LABEL: v_fdot2_f32_bf16_opsel_lo_a:
 ; GFX11:  ; %bb.0:
 ; GFX11:    v_mov_b16_e32 v0.l, v0.h
-; GFX11:    v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11:    v_dot2_f32_bf16 v2, v0, v1, v2
+; GFX11:    v_mov_b32_e32 v0, v2
 ;
 ; GFX12-LABEL: v_fdot2_f32_bf16_opsel_lo_a:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_perm_b32 v0, v0, v0, 0x7060302
-; GFX12:    v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX12:    v_dot2_f32_bf16 v2, v0, v1, v2
+; GFX12:    v_mov_b32_e32 v0, v2
   %shuf = shufflevector <2 x bfloat> %a, <2 x bfloat> poison, <2 x i32> <i32 
1, i32 1>
   %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %shuf, <2 x bfloat> 
%b, float %c, i1 false)
   ret float %r
@@ -99,12 +102,14 @@ define float @v_fdot2_f32_bf16_opsel_hi_a(<2 x bfloat> %a, 
<2 x bfloat> %b, floa
 ; GFX11-LABEL: v_fdot2_f32_bf16_opsel_hi_a:
 ; GFX11:  ; %bb.0:
 ; GFX11:    v_mov_b16_e32 v0.h, v0.l
-; GFX11:    v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11:    v_dot2_f32_bf16 v2, v0, v1, v2
+; GFX11:    v_mov_b32_e32 v0, v2
 ;
 ; GFX12-LABEL: v_fdot2_f32_bf16_opsel_hi_a:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_perm_b32 v0, v0, v0, 0x5040100
-; GFX12:    v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX12:    v_dot2_f32_bf16 v2, v0, v1, v2
+; GFX12:    v_mov_b32_e32 v0, v2
   %shuf = shufflevector <2 x bfloat> %a, <2 x bfloat> poison, <2 x i32> <i32 
0, i32 0>
   %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %shuf, <2 x bfloat> 
%b, float %c, i1 false)
   ret float %r
@@ -121,12 +126,14 @@ define float @v_fdot2_f32_bf16_opsel_lo_b(<2 x bfloat> 
%a, <2 x bfloat> %b, floa
 ; GFX11-LABEL: v_fdot2_f32_bf16_opsel_lo_b:
 ; GFX11:  ; %bb.0:
 ; GFX11:    v_mov_b16_e32 v1.l, v1.h
-; GFX11:    v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11:    v_dot2_f32_bf16 v2, v0, v1, v2
+; GFX11:    v_mov_b32_e32 v0, v2
 ;
 ; GFX12-LABEL: v_fdot2_f32_bf16_opsel_lo_b:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_perm_b32 v1, v1, v1, 0x7060302
-; GFX12:    v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX12:    v_dot2_f32_bf16 v2, v0, v1, v2
+; GFX12:    v_mov_b32_e32 v0, v2
   %shuf = shufflevector <2 x bfloat> %b, <2 x bfloat> poison, <2 x i32> <i32 
1, i32 1>
   %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> 
%shuf, float %c, i1 false)
   ret float %r
@@ -143,12 +150,14 @@ define float @v_fdot2_f32_bf16_opsel_hi_b(<2 x bfloat> 
%a, <2 x bfloat> %b, floa
 ; GFX11-LABEL: v_fdot2_f32_bf16_opsel_hi_b:
 ; GFX11:  ; %bb.0:
 ; GFX11:    v_mov_b16_e32 v1.h, v1.l
-; GFX11:    v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11:    v_dot2_f32_bf16 v2, v0, v1, v2
+; GFX11:    v_mov_b32_e32 v0, v2
 ;
 ; GFX12-LABEL: v_fdot2_f32_bf16_opsel_hi_b:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_perm_b32 v1, v1, v1, 0x5040100
-; GFX12:    v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX12:    v_dot2_f32_bf16 v2, v0, v1, v2
+; GFX12:    v_mov_b32_e32 v0, v2
   %shuf = shufflevector <2 x bfloat> %b, <2 x bfloat> poison, <2 x i32> <i32 
0, i32 0>
   %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> 
%shuf, float %c, i1 false)
   ret float %r
@@ -162,7 +171,8 @@ define float @v_fdot2_f32_bf16_inline_literal_a(<2 x 
bfloat> %b, float %c) {
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_a:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, 0x3f003f00, v0, v1
+; GFX11PLUS:    v_dot2_f32_bf16 v1, 0x3f003f00, v0, v1
+; GFX11PLUS:    v_mov_b32_e32 v0, v1
   %ret = tail call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> <bfloat 0.5, 
bfloat 0.5>, <2 x bfloat> %b, float %c, i1 false)
   ret float %ret
 }
@@ -175,7 +185,8 @@ define float @v_fdot2_f32_bf16_inline_literal_b(<2 x 
bfloat> %a, float %c) {
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, 0x40004000, v1
+; GFX11PLUS:    v_dot2_f32_bf16 v1, v0, 0x40004000, v1
+; GFX11PLUS:    v_mov_b32_e32 v0, v1
   %ret = tail call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x 
bfloat> <bfloat 2.0, bfloat 2.0>, float %c, i1 false)
   ret float %ret
 }
@@ -189,7 +200,10 @@ define float @v_fdot2_f32_bf16_inline_literal_c(<2 x 
bfloat> %a, <2 x bfloat> %b
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_c:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, 2.0
+; GFX11PLUS:    s_mov_b32 s0, 2.0
+; GFX11PLUS:    v_mov_b32_e32 v2, s0
+; GFX11PLUS:    v_dot2_f32_bf16 v2, v0, v1, v2
+; GFX11PLUS:    v_mov_b32_e32 v0, v2
   %ret = tail call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x 
bfloat> %b, float 2.0, i1 false)
   ret float %ret
 }
@@ -375,9 +389,8 @@ define float @v_fdot2_f32_bf16_dual(<2 x bfloat> %a, <2 x 
bfloat> %b, float %c,
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_dual:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dual_dot2acc_f32_bf16 v2, v0, v1 :: v_dual_dot2acc_f32_bf16 
v5, v3, v4
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> 
%b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> 
%e, float %f, i1 false)
   %r = fadd float %r0, %r1
@@ -394,8 +407,8 @@ define float @v_fdot2_f32_bf16_neg_a_dual(<2 x bfloat> %a, 
<2 x bfloat> %b, floa
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_a_dual:
 ; GFX11PLUS:  ; %bb.0:
 ; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dot2_f32_bf16 v5, v3, v4, v5
+; GFX11PLUS:    v_add_f32_e32 v0, v0, v5
   %neg.a = fneg <2 x bfloat> %a
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %neg.a, <2 x 
bfloat> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> 
%e, float %f, i1 false)
@@ -413,8 +426,8 @@ define float @v_fdot2_f32_bf16_neg_b_dual(<2 x bfloat> %a, 
<2 x bfloat> %b, floa
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_b_dual:
 ; GFX11PLUS:  ; %bb.0:
 ; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dot2_f32_bf16 v5, v3, v4, v5
+; GFX11PLUS:    v_add_f32_e32 v0, v0, v5
   %neg.b = fneg <2 x bfloat> %b
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> 
%neg.b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> 
%e, float %f, i1 false)
@@ -433,8 +446,8 @@ define float @v_fdot2_f32_bf16_neg_c_dual(<2 x bfloat> %a, 
<2 x bfloat> %b, floa
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_c_dual:
 ; GFX11PLUS:  ; %bb.0:
 ; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,0,1]
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dot2_f32_bf16 v5, v3, v4, v5
+; GFX11PLUS:    v_add_f32_e32 v0, v0, v5
   %neg.c = fneg float %c
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> 
%b, float %neg.c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> 
%e, float %f, i1 false)
@@ -453,8 +466,8 @@ define float @v_fdot2_f32_bf16_abs_c_dual(<2 x bfloat> %a, 
<2 x bfloat> %b, floa
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_abs_c_dual:
 ; GFX11PLUS:  ; %bb.0:
 ; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2 neg_hi:[0,0,1]
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dot2_f32_bf16 v5, v3, v4, v5
+; GFX11PLUS:    v_add_f32_e32 v0, v0, v5
   %abs.c = call float @llvm.fabs.f32(float %c)
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> 
%b, float %abs.c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> 
%e, float %f, i1 false)
@@ -474,16 +487,14 @@ define float @v_fdot2_f32_bf16_opsel_lo_a_dual(<2 x 
bfloat> %a, <2 x bfloat> %b,
 ; GFX11-LABEL: v_fdot2_f32_bf16_opsel_lo_a_dual:
 ; GFX11:  ; %bb.0:
 ; GFX11:    v_mov_b16_e32 v0.l, v0.h
-; GFX11:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX11:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11:    v_add_f32_e32 v0, v0, v1
+; GFX11:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, 
v0, v1
+; GFX11:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX12-LABEL: v_fdot2_f32_bf16_opsel_lo_a_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_perm_b32 v0, v0, v0, 0x7060302
-; GFX12:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, 
v0, v1
+; GFX12:    v_add_f32_e32 v0, v2, v5
   %shuf = shufflevector <2 x bfloat> %a, <2 x bfloat> poison, <2 x i32> <i32 
1, i32 1>
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %shuf, <2 x 
bfloat> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> 
%e, float %f, i1 false)
@@ -503,16 +514,14 @@ define float @v_fdot2_f32_bf16_opsel_hi_a_dual(<2 x 
bfloat> %a, <2 x bfloat> %b,
 ; GFX11-LABEL: v_fdot2_f32_bf16_opsel_hi_a_dual:
 ; GFX11:  ; %bb.0:
 ; GFX11:    v_mov_b16_e32 v0.h, v0.l
-; GFX11:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX11:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11:    v_add_f32_e32 v0, v0, v1
+; GFX11:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, 
v0, v1
+; GFX11:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX12-LABEL: v_fdot2_f32_bf16_opsel_hi_a_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_perm_b32 v0, v0, v0, 0x5040100
-; GFX12:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, 
v0, v1
+; GFX12:    v_add_f32_e32 v0, v2, v5
   %shuf = shufflevector <2 x bfloat> %a, <2 x bfloat> poison, <2 x i32> <i32 
0, i32 0>
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %shuf, <2 x 
bfloat> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> 
%e, float %f, i1 false)
@@ -532,16 +541,14 @@ define float @v_fdot2_f32_bf16_opsel_lo_b_dual(<2 x 
bfloat> %a, <2 x bfloat> %b,
 ; GFX11-LABEL: v_fdot2_f32_bf16_opsel_lo_b_dual:
 ; GFX11:  ; %bb.0:
 ; GFX11:    v_mov_b16_e32 v1.l, v1.h
-; GFX11:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX11:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11:    v_add_f32_e32 v0, v0, v1
+; GFX11:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, 
v0, v1
+; GFX11:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX12-LABEL: v_fdot2_f32_bf16_opsel_lo_b_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_perm_b32 v1, v1, v1, 0x7060302
-; GFX12:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, 
v0, v1
+; GFX12:    v_add_f32_e32 v0, v2, v5
   %shuf = shufflevector <2 x bfloat> %b, <2 x bfloat> poison, <2 x i32> <i32 
1, i32 1>
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> 
%shuf, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> 
%e, float %f, i1 false)
@@ -561,16 +568,14 @@ define float @v_fdot2_f32_bf16_opsel_hi_b_dual(<2 x 
bfloat> %a, <2 x bfloat> %b,
 ; GFX11-LABEL: v_fdot2_f32_bf16_opsel_hi_b_dual:
 ; GFX11:  ; %bb.0:
 ; GFX11:    v_mov_b16_e32 v1.h, v1.l
-; GFX11:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX11:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11:    v_add_f32_e32 v0, v0, v1
+; GFX11:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, 
v0, v1
+; GFX11:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX12-LABEL: v_fdot2_f32_bf16_opsel_hi_b_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_perm_b32 v1, v1, v1, 0x5040100
-; GFX12:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, 
v0, v1
+; GFX12:    v_add_f32_e32 v0, v2, v5
   %shuf = shufflevector <2 x bfloat> %b, <2 x bfloat> poison, <2 x i32> <i32 
0, i32 0>
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> 
%shuf, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> 
%e, float %f, i1 false)
@@ -587,9 +592,9 @@ define float @v_fdot2_f32_bf16_inline_literal_a_dual(<2 x 
bfloat> %b, float %c,
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_a_dual:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, 0x40004000, v0, v1
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v2, v3, v4
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dot2_f32_bf16 v1, 0x40004000, v0, v1
+; GFX11PLUS:    v_dot2_f32_bf16 v4, v2, v3, v4
+; GFX11PLUS:    v_add_f32_e32 v0, v1, v4
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> <bfloat 2.0, 
bfloat 2.0>, <2 x bfloat> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> 
%e, float %f, i1 false)
   %r = fadd float %r0, %r1
@@ -605,9 +610,9 @@ define float @v_fdot2_f32_bf16_inline_literal_b_dual(<2 x 
bfloat> %a, float %c,
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b_dual:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, 0x40004000, v1
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v2, v3, v4
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dot2_f32_bf16 v1, v0, 0x40004000, v1
+; GFX11PLUS:    v_dot2_f32_bf16 v4, v2, v3, v4
+; GFX11PLUS:    v_add_f32_e32 v0, v1, v4
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> 
<bfloat 2.0, bfloat 2.0>, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> 
%e, float %f, i1 false)
   %r = fadd float %r0, %r1
@@ -624,9 +629,10 @@ define float @v_fdot2_f32_bf16_inline_literal_c_dual(<2 x 
bfloat> %a, <2 x bfloa
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_c_dual:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, 2.0
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v2, v3, v4
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    s_mov_b32 s0, 2.0
+; GFX11PLUS:    v_dual_dot2acc_f32_bf16 v4, v2, v3 :: v_dual_mov_b32 v5, s0
+; GFX11PLUS:    v_dot2_f32_bf16 v5, v0, v1, v5
+; GFX11PLUS:    v_add_f32_e32 v0, v5, v4
   %r0 = tail call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x 
bfloat> %b, float 2.0, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> 
%e, float %f, i1 false)
   %r = fadd float %r0, %r1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
index de0722681df28..4d05181c2d8ba 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
@@ -29,7 +29,8 @@ define float @v_fdot2(<2 x half> %a, <2 x half> %b, float %c) 
{
 ;
 ; GFX12-LABEL: v_fdot2:
 ; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX12:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX12:    v_mov_b32_e32 v0, v2
   %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, 
i1 false)
   ret float %r
 }
@@ -129,7 +130,8 @@ define float @v_fdot2_opsel_lo_a(<2 x half> %a, <2 x half> 
%b, float %c) {
 ; GFX12-LABEL: v_fdot2_opsel_lo_a:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_perm_b32 v0, v0, v0, 0x7060302
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX12:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX12:    v_mov_b32_e32 v0, v2
   %shuf = shufflevector <2 x half> %a, <2 x half> poison, <2 x i32> <i32 1, 
i32 1>
   %r = call float @llvm.amdgcn.fdot2(<2 x half> %shuf, <2 x half> %b, float 
%c, i1 false)
   ret float %r
@@ -160,7 +162,8 @@ define float @v_fdot2_opsel_hi_a(<2 x half> %a, <2 x half> 
%b, float %c) {
 ; GFX12-LABEL: v_fdot2_opsel_hi_a:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_perm_b32 v0, v0, v0, 0x5040100
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX12:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX12:    v_mov_b32_e32 v0, v2
   %shuf = shufflevector <2 x half> %a, <2 x half> poison, <2 x i32> <i32 0, 
i32 0>
   %r = call float @llvm.amdgcn.fdot2(<2 x half> %shuf, <2 x half> %b, float 
%c, i1 false)
   ret float %r
@@ -191,7 +194,8 @@ define float @v_fdot2_opsel_lo_b(<2 x half> %a, <2 x half> 
%b, float %c) {
 ; GFX12-LABEL: v_fdot2_opsel_lo_b:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_perm_b32 v1, v1, v1, 0x7060302
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX12:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX12:    v_mov_b32_e32 v0, v2
   %shuf = shufflevector <2 x half> %b, <2 x half> poison, <2 x i32> <i32 1, 
i32 1>
   %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %shuf, float 
%c, i1 false)
   ret float %r
@@ -222,7 +226,8 @@ define float @v_fdot2_opsel_hi_b(<2 x half> %a, <2 x half> 
%b, float %c) {
 ; GFX12-LABEL: v_fdot2_opsel_hi_b:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_perm_b32 v1, v1, v1, 0x5040100
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX12:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX12:    v_mov_b32_e32 v0, v2
   %shuf = shufflevector <2 x half> %b, <2 x half> poison, <2 x i32> <i32 0, 
i32 0>
   %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %shuf, float 
%c, i1 false)
   ret float %r
@@ -250,7 +255,8 @@ define float @v_fdot2_inline_literal_a(<2 x half> %b, float 
%c) {
 ;
 ; GFX12-LABEL: v_fdot2_inline_literal_a:
 ; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, 0x40004000, v0, v1
+; GFX12:    v_dot2_f32_f16 v1, 0x40004000, v0, v1
+; GFX12:    v_mov_b32_e32 v0, v1
   %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> <half 2.0, half 2.0>, 
<2 x half> %b, float %c, i1 false)
   ret float %ret
 }
@@ -277,7 +283,8 @@ define float @v_fdot2_inline_literal_b(<2 x half> %a, float 
%c) {
 ;
 ; GFX12-LABEL: v_fdot2_inline_literal_b:
 ; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, v0, 0x40004000, v1
+; GFX12:    v_dot2_f32_f16 v1, v0, 0x40004000, v1
+; GFX12:    v_mov_b32_e32 v0, v1
   %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> <half 
2.0, half 2.0>, float %c, i1 false)
   ret float %ret
 }
@@ -307,7 +314,10 @@ define float @v_fdot2_inline_literal_c(<2 x half> %a, <2 x 
half> %b) {
 ;
 ; GFX12-LABEL: v_fdot2_inline_literal_c:
 ; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, 2.0
+; GFX12:    s_mov_b32 s0, 2.0
+; GFX12:    v_mov_b32_e32 v2, s0
+; GFX12:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX12:    v_mov_b32_e32 v0, v2
   %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, 
float 2.0, i1 false)
   ret float %ret
 }
@@ -571,16 +581,10 @@ define float @v_fdot2_dual(<2 x half> %a, <2 x half> %b, 
float %c, <2 x half> %d
 ; GFX10:    v_dot2c_f32_f16 v5, v3, v4
 ; GFX10:    v_add_f32_e32 v0, v2, v5
 ;
-; GFX11-LABEL: v_fdot2_dual:
-; GFX11:  ; %bb.0:
-; GFX11:    v_dual_dot2acc_f32_f16 v2, v0, v1 :: v_dual_dot2acc_f32_f16 v5, 
v3, v4
-; GFX11:    v_add_f32_e32 v0, v2, v5
-;
-; GFX12-LABEL: v_fdot2_dual:
-; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS-LABEL: v_fdot2_dual:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_dual_dot2acc_f32_f16 v2, v0, v1 :: v_dual_dot2acc_f32_f16 
v5, v3, v4
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, 
i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, 
i1 false)
   %r = fadd float %r0, %r1
@@ -615,8 +619,8 @@ define float @v_fdot2_neg_a_dual(<2 x half> %a, <2 x half> 
%b, float %c, <2 x ha
 ; GFX12-LABEL: v_fdot2_neg_a_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX12:    v_add_f32_e32 v0, v0, v5
   %neg.a = fneg <2 x half> %a
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %neg.a, <2 x half> %b, float 
%c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, 
i1 false)
@@ -652,8 +656,8 @@ define float @v_fdot2_neg_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha
 ; GFX12-LABEL: v_fdot2_neg_b_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX12:    v_add_f32_e32 v0, v0, v5
   %neg.b = fneg <2 x half> %b
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %neg.b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
@@ -692,8 +696,8 @@ define float @v_fdot2_neg_c_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha
 ; GFX12-LABEL: v_fdot2_neg_c_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1]
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX12:    v_add_f32_e32 v0, v0, v5
   %neg.c = fneg float %c
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %neg.c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
@@ -732,8 +736,8 @@ define float @v_fdot2_abs_c_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha
 ; GFX12-LABEL: v_fdot2_abs_c_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1]
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX12:    v_add_f32_e32 v0, v0, v5
   %abs.c = call float @llvm.fabs.f32(float %c)
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %abs.c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
@@ -771,9 +775,8 @@ define float @v_fdot2_opsel_lo_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2
 ; GFX12-LABEL: v_fdot2_opsel_lo_a_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_perm_b32 v0, v0, v0, 0x7060302
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1
+; GFX12:    v_add_f32_e32 v0, v2, v5
   %shuf = shufflevector <2 x half> %a, <2 x half> poison, <2 x i32> <i32 1, i32 1>
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %shuf, <2 x half> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
@@ -811,9 +814,8 @@ define float @v_fdot2_opsel_hi_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2
 ; GFX12-LABEL: v_fdot2_opsel_hi_a_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_perm_b32 v0, v0, v0, 0x5040100
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1
+; GFX12:    v_add_f32_e32 v0, v2, v5
   %shuf = shufflevector <2 x half> %a, <2 x half> poison, <2 x i32> <i32 0, i32 0>
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %shuf, <2 x half> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
@@ -851,9 +853,8 @@ define float @v_fdot2_opsel_lo_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2
 ; GFX12-LABEL: v_fdot2_opsel_lo_b_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_perm_b32 v1, v1, v1, 0x7060302
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1
+; GFX12:    v_add_f32_e32 v0, v2, v5
   %shuf = shufflevector <2 x half> %b, <2 x half> poison, <2 x i32> <i32 1, i32 1>
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %shuf, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
@@ -891,9 +892,8 @@ define float @v_fdot2_opsel_hi_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2
 ; GFX12-LABEL: v_fdot2_opsel_hi_b_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_perm_b32 v1, v1, v1, 0x5040100
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1
+; GFX12:    v_add_f32_e32 v0, v2, v5
   %shuf = shufflevector <2 x half> %b, <2 x half> poison, <2 x i32> <i32 0, i32 0>
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %shuf, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
@@ -927,9 +927,9 @@ define float @v_fdot2_inline_literal_a_dual(<2 x half> %b, float %c, <2 x half>
 ;
 ; GFX12-LABEL: v_fdot2_inline_literal_a_dual:
 ; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, 0x40004000, v0, v1
-; GFX12:    v_dot2_f32_f16 v1, v2, v3, v4
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dot2_f32_f16 v1, 0x40004000, v0, v1
+; GFX12:    v_dot2_f32_f16 v4, v2, v3, v4
+; GFX12:    v_add_f32_e32 v0, v1, v4
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> <half 2.0, half 2.0>, <2 x half> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
   %r = fadd float %r0, %r1
@@ -962,9 +962,9 @@ define float @v_fdot2_inline_literal_b_dual(<2 x half> %a, float %c, <2 x half>
 ;
 ; GFX12-LABEL: v_fdot2_inline_literal_b_dual:
 ; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, v0, 0x40004000, v1
-; GFX12:    v_dot2_f32_f16 v1, v2, v3, v4
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dot2_f32_f16 v1, v0, 0x40004000, v1
+; GFX12:    v_dot2_f32_f16 v4, v2, v3, v4
+; GFX12:    v_add_f32_e32 v0, v1, v4
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> <half 2.0, half 2.0>, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
   %r = fadd float %r0, %r1
@@ -1000,9 +1000,10 @@ define float @v_fdot2_inline_literal_c_dual(<2 x half> %a, <2 x half> %b, <2 x h
 ;
 ; GFX12-LABEL: v_fdot2_inline_literal_c_dual:
 ; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, 2.0
-; GFX12:    v_dot2_f32_f16 v1, v2, v3, v4
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    s_mov_b32 s0, 2.0
+; GFX12:    v_dual_dot2acc_f32_f16 v4, v2, v3 :: v_dual_mov_b32 v5, s0
+; GFX12:    v_dot2_f32_f16 v5, v0, v1, v5
+; GFX12:    v_add_f32_e32 v0, v5, v4
   %r0 = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float 2.0, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
   %r = fadd float %r0, %r1

_______________________________________________
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

[llvm-branch-commits] [llvm] AMDGPU: Codegen for v_dual_dot2acc_f32_f16/bf16 from VOP3 (PR #179226)

Reply via email to