https://github.com/shiltian updated 
https://github.com/llvm/llvm-project/pull/174310

>From 2556f42e1cc25d04a55825410b6a2d7d0e9528e0 Mon Sep 17 00:00:00 2001
From: Shilei Tian <[email protected]>
Date: Sat, 3 Jan 2026 19:42:19 -0500
Subject: [PATCH] [AMDGPU] Rework the clamp support for WMMA instructions

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |  4 +-
 clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp   | 14 +++++
 clang/lib/Sema/SemaAMDGPU.cpp                 | 34 +++++++++++
 .../builtins-amdgcn-gfx1250-wmma-w32.cl       | 26 ++++++++-
 ...ins-amdgcn-error-gfx1250-wmma-w32-param.cl |  6 ++
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      | 27 +++++++--
 llvm/lib/IR/AutoUpgrade.cpp                   | 57 +++++++++++++++++++
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 12 ++--
 llvm/lib/Target/AMDGPU/VOP3PInstructions.td   | 18 +++---
 .../UniformityAnalysis/AMDGPU/intrinsics.ll   | 12 ++--
 .../test/Bitcode/amdgpu-wmma-clamp-upgrade.ll | 25 ++++++++
 .../AMDGPU/wmma-coececution-valu-hazards.mir  | 52 ++++++++---------
 .../AMDGPU/wmma-hazards-gfx1250-w32.mir       | 40 ++++++-------
 llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s    | 10 ++++
 .../test/MC/AMDGPU/gfx1250_asm_wmma_w32_err.s |  6 --
 .../AMDGPU/gfx1250_dasm_wmma_w32.txt          |  6 ++
 mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td  |  9 +--
 mlir/test/Target/LLVMIR/rocdl.mlir            | 12 ++--
 18 files changed, 280 insertions(+), 90 deletions(-)
 create mode 100644 llvm/test/Bitcode/amdgpu-wmma-clamp-upgrade.ll

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 24b79c3b69b67..7faf73b7628fe 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -857,7 +857,7 @@ TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8, 
"V8hV8iV8iIsV8hIbIb",
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8, 
"V8hV8iV8iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8, 
"V8hV8iV8iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8, 
"V8hV8iV8iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
-TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x64_iu8, "V8iIbV8iIbV8iV8iIbIb", 
"nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x64_iu8, 
"V8iIbV8iIbV8iV8iIbIb.", "nc", "gfx1250-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8, 
"V8hV16iV16iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8, 
"V8hV16iV16iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8, 
"V8hV16iV16iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
@@ -885,7 +885,7 @@ 
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8, "V8hV8iV16iV8hiIbI
 TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8, 
"V8hV8iV16iV8hiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8, 
"V8hV8iV16iV8hiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8, 
"V8hV8iV16iV8hiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
-TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x128_iu8, 
"V8iIbV8iIbV16iV8iiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x128_iu8, 
"V8iIbV8iIbV16iV8iiIbIb.", "nc", "gfx1250-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x64_f16, 
"V8fIbV16hIbV32hV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x64_f16, 
"V8hIbV16hIbV32hV8hiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
 
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp 
b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index eabdc370da6b4..a8a5bc348f00c 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -1665,6 +1665,20 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
     if (AppendFalseForOpselArg)
       Args.push_back(Builder.getFalse());
 
+    // Optional clamp operand: default to false; any nonzero constant is true.
+    if (BuiltinID == AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8) {
+      if (Args.size() == 7)
+        Args.push_back(Builder.getFalse());
+      assert(Args.size() == 8 && "Expected 8 arguments");
+      Args[7] = Builder.CreateIsNotNull(Args[7]);
+    } else if (BuiltinID ==
+               AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x128_iu8) {
+      if (Args.size() == 8)
+        Args.push_back(Builder.getFalse());
+      assert(Args.size() == 9 && "Expected 9 arguments");
+      Args[8] = Builder.CreateIsNotNull(Args[8]);
+    }
+
     SmallVector<llvm::Type *, 6> ArgTypes;
     if (NeedReturnType)
       ArgTypes.push_back(ConvertType(E->getType()));
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index cece22092bb14..9d154c65c932e 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -255,6 +255,40 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned 
BuiltinID,
            (SemaRef.BuiltinConstantArg(TheCall, ArgCount, Result)) ||
            (SemaRef.BuiltinConstantArg(TheCall, (ArgCount - 1), Result));
   }
+  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x128_iu8: {
+    if (BuiltinID == AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8) {
+      if (SemaRef.checkArgCountRange(TheCall, 7, 8))
+        return true;
+      if (TheCall->getNumArgs() == 7)
+        return false;
+    } else if (BuiltinID ==
+               AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x128_iu8) {
+      if (SemaRef.checkArgCountRange(TheCall, 8, 9))
+        return true;
+      if (TheCall->getNumArgs() == 8)
+        return false;
+    }
+    // Check if the last argument (clamp operand) is a constant and is
+    // convertible to bool.
+    Expr *ClampArg = TheCall->getArg(TheCall->getNumArgs() - 1);
+    // 1) Ensure clamp argument is a constant expression
+    llvm::APSInt ClampValue;
+    if (!SemaRef.VerifyIntegerConstantExpression(ClampArg, &ClampValue)
+             .isUsable())
+      return true;
+    // 2) Check if the argument can be converted to bool type
+    if (!SemaRef.Context.hasSameType(ClampArg->getType(),
+                                     SemaRef.Context.BoolTy)) {
+      // Try to convert to bool
+      QualType BoolTy = SemaRef.Context.BoolTy;
+      ExprResult ClampExpr(ClampArg);
+      SemaRef.CheckSingleAssignmentConstraints(BoolTy, ClampExpr);
+      if (ClampExpr.isInvalid())
+        return true;
+    }
+    return false;
+  }
   default:
     return false;
   }
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl
index afad4bb15b528..a463ba7ab41c3 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl
@@ -148,7 +148,7 @@ void test_amdgcn_wmma_f16_16x16x64_bf8_bf8(global v8h* out, 
v8i a, v8i b, v8h c)
 
 // CHECK-GFX1250-LABEL: @test_amdgcn_wmma_i32_16x16x64_iu8(
 // CHECK-GFX1250-NEXT:  entry:
-// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x i32> 
@llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 false, <8 x i32> [[A:%.*]], 
i1 false, <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x i32> 
@llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 false, <8 x i32> [[A:%.*]], 
i1 false, <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false, i1 true, i1 false)
 // CHECK-GFX1250-NEXT:    store <8 x i32> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 32, !tbaa [[TBAA8]]
 // CHECK-GFX1250-NEXT:    ret void
 //
@@ -157,6 +157,17 @@ void test_amdgcn_wmma_i32_16x16x64_iu8(global v8i* out, 
v8i a, v8i b, v8i c)
   *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, 0, b, c, false, true);
 }
 
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_i32_16x16x64_iu8_clamp(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x i32> 
@llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 false, <8 x i32> [[A:%.*]], 
i1 false, <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false, i1 true, i1 true)
+// CHECK-GFX1250-NEXT:    store <8 x i32> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 32, !tbaa [[TBAA8]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_wmma_i32_16x16x64_iu8_clamp(global v8i* out, v8i a, v8i b, 
v8i c)
+{
+  *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, 0, b, c, false, true, 1);
+}
+
 // CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x128_f8f6f4(
 // CHECK-GFX1250-NEXT:  entry:
 // CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i32> [[B:%.*]], 
<16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, 
i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -459,7 +470,7 @@ void test_amdgcn_swmmac_f16_16x16x128_bf8_bf8(global v8h* 
out, v8i a, v16i b, v8
 
 // CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_i32_16x16x128_iu8(
 // CHECK-GFX1250-NEXT:  entry:
-// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x i32> 
@llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i32(i1 false, <8 x 
i32> [[A:%.*]], i1 false, <16 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i32 
[[INDEX:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x i32> 
@llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i32(i1 false, <8 x 
i32> [[A:%.*]], i1 false, <16 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i32 
[[INDEX:%.*]], i1 false, i1 true, i1 false)
 // CHECK-GFX1250-NEXT:    store <8 x i32> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 32, !tbaa [[TBAA8]]
 // CHECK-GFX1250-NEXT:    ret void
 //
@@ -468,6 +479,17 @@ void test_amdgcn_swmmac_i32_16x16x128_iu8(global v8i* out, 
v8i a, v16i b, v8i c,
   *out = __builtin_amdgcn_swmmac_i32_16x16x128_iu8(0, a, 0, b, c, index, 
false, true);
 }
 
+// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_i32_16x16x128_iu8_clamp(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x i32> 
@llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i32(i1 false, <8 x 
i32> [[A:%.*]], i1 false, <16 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i32 
[[INDEX:%.*]], i1 false, i1 true, i1 true)
+// CHECK-GFX1250-NEXT:    store <8 x i32> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 32, !tbaa [[TBAA8]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_swmmac_i32_16x16x128_iu8_clamp(global v8i* out, v8i a, v16i 
b, v8i c, int index)
+{
+  *out = __builtin_amdgcn_swmmac_i32_16x16x128_iu8(0, a, 0, b, c, index, 
false, true, 1);
+}
+
 // CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f32_16x16x64_f16(
 // CHECK-GFX1250-NEXT:  entry:
 // CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x float> 
@llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i32(i1 false, <16 x 
half> [[A:%.*]], i1 false, <32 x half> [[B:%.*]], <8 x float> [[C:%.*]], i32 
[[INDEX:%.*]], i1 false, i1 true)
diff --git 
a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl 
b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl
index 49ef2e571740c..e3e0cc3f596c7 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl
@@ -112,6 +112,9 @@ void test_amdgcn_wmma_i32_16x16x64_iu8(global v8i* out, v8i 
a, v8i b, v8i c, int
   *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, mod, b, c, false, 
false); // expected-error {{'__builtin_amdgcn_wmma_i32_16x16x64_iu8' must be a 
constant integer}}
   *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, 0, b, c, mod, false); // 
expected-error {{'__builtin_amdgcn_wmma_i32_16x16x64_iu8' must be a constant 
integer}}
   *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, 0, b, c, false, mod); // 
expected-error {{'__builtin_amdgcn_wmma_i32_16x16x64_iu8' must be a constant 
integer}}
+  *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, 0, b, c, false, true, 
32.0f); // expected-error {{integer constant expression must have integer type, 
not 'double'}}
+  *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, 0, b, c, false, true, 
mod); // expected-error {{expression is not an integer constant expression}}
+  *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, 0, b, c, false, true, 
true, 32.0f); // expected-error {{too many arguments to function call, expected 
at most 8, have 9}}
 }
 
 void test_amdgcn_wmma_f32_16x16x128_f8f6f4(global v8f* out, v16i a, v16i b, 
v8f c, int mod)
@@ -286,6 +289,9 @@ void test_amdgcn_swmmac_i32_16x16x128_iu8(global v8i* out, 
v8i a, v16i b, v8i c,
   *out = __builtin_amdgcn_swmmac_i32_16x16x128_iu8(0, a, mod, b, c, index, 
false, false); // expected-error {{'__builtin_amdgcn_swmmac_i32_16x16x128_iu8' 
must be a constant integer}}
   *out = __builtin_amdgcn_swmmac_i32_16x16x128_iu8(0, a, 0, b, c, index, mod, 
false); // expected-error {{'__builtin_amdgcn_swmmac_i32_16x16x128_iu8' must be 
a constant integer}}
   *out = __builtin_amdgcn_swmmac_i32_16x16x128_iu8(0, a, 0, b, c, index, 
false, mod); // expected-error {{'__builtin_amdgcn_swmmac_i32_16x16x128_iu8' 
must be a constant integer}}
+  *out = __builtin_amdgcn_swmmac_i32_16x16x128_iu8(0, a, 0, b, c, index, 
false, true, 32.0f); // expected-error {{integer constant expression must have 
integer type, not 'double'}}
+  *out = __builtin_amdgcn_swmmac_i32_16x16x128_iu8(0, a, 0, b, c, index, 
false, true, mod); // expected-error {{expression is not an integer constant 
expression}}
+  *out = __builtin_amdgcn_swmmac_i32_16x16x128_iu8(0, a, 0, b, c, index, 
false, true, true, 32.0f); // expected-error {{too many arguments to function 
call, expected at most 9, have 10}}
 }
 
 void test_amdgcn_swmmac_f32_16x16x64_f16(global v8f* out, v16h a, v32h b, v8f 
c, int index, int mod)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index f7d4b51c75168..2fb25bac43756 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3902,7 +3902,7 @@ def int_amdgcn_global_store_async_from_lds_b128 :
   ClangBuiltin<"__builtin_amdgcn_global_store_async_from_lds_b128">, 
AMDGPUAsyncGlobalStoreFromLDS;
 
 // WMMA intrinsics.
-class AMDGPUWmmaIntrinsicModsAB<LLVMType AB, LLVMType CD> :
+class AMDGPUWmmaIntrinsicModsABClamp<LLVMType AB, LLVMType CD> :
   Intrinsic<
     [CD], // %D
     [
@@ -3913,8 +3913,9 @@ class AMDGPUWmmaIntrinsicModsAB<LLVMType AB, LLVMType CD> 
:
       LLVMMatchType<0>,               // %C
       llvm_i1_ty,       // matrix_a_reuse
       llvm_i1_ty,       // matrix_b_reuse
+      llvm_i1_ty,       // %clamp
     ],
-    [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, 
ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>,
+    [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, 
ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>,
      IntrWillReturn, IntrNoCallback, IntrNoFree, IntrNoCreateUndefOrPoison]
 >;
 
@@ -4079,7 +4080,7 @@ def int_amdgcn_wmma_f32_16x16x128_fp8_fp8 : 
AMDGPUWmmaIntrinsicModsC<llvm_anyint
 def int_amdgcn_wmma_f32_16x16x128_fp8_bf8 : 
AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
 def int_amdgcn_wmma_f32_16x16x128_bf8_fp8 : 
AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
 def int_amdgcn_wmma_f32_16x16x128_bf8_bf8 : 
AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
-def int_amdgcn_wmma_i32_16x16x64_iu8      : 
AMDGPUWmmaIntrinsicModsAB<llvm_anyint_ty, llvm_anyint_ty>;
+def int_amdgcn_wmma_i32_16x16x64_iu8      : 
AMDGPUWmmaIntrinsicModsABClamp<llvm_anyint_ty, llvm_anyint_ty>;
 def int_amdgcn_wmma_f32_16x16x128_f8f6f4  : AMDGPUWmmaIntrinsicModsC_MatrixFMT;
 def int_amdgcn_wmma_scale_f32_16x16x128_f8f6f4   : 
AMDGPUWmmaScaleIntrinsicModsC<llvm_i32_ty>;
 def int_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4 : 
AMDGPUWmmaScaleIntrinsicModsC<llvm_i64_ty>;
@@ -4105,6 +4106,24 @@ class AMDGPUSWmmacIntrinsicABIdx<LLVMType A, LLVMType B, 
LLVMType CD, LLVMType I
      ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>, 
ImmArg<ArgIndex<7>>]
 >;
 
+class AMDGPUSWmmacIntrinsicABIdxClamp<LLVMType A, LLVMType B, LLVMType CD, 
LLVMType Index> :
+  Intrinsic<
+    [CD], // %D
+    [
+      llvm_i1_ty,       // %A_mod:  0 - none, 1 - neg
+      A,                // %A
+      llvm_i1_ty,       // %B_mod:  0 - none, 1 - neg
+      B,                // %B
+      LLVMMatchType<0>, // %C
+      Index,            // %Sparsity index for A
+      llvm_i1_ty,       // matrix_a_reuse
+      llvm_i1_ty,       // matrix_b_reuse
+      llvm_i1_ty,       // %clamp
+    ],
+    [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCreateUndefOrPoison,
+     ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>, 
ImmArg<ArgIndex<7>>, ImmArg<ArgIndex<8>>]
+>;
+
 defset list<Intrinsic> AMDGPUSWMMACIntrinsicsGFX1250 = {
 def int_amdgcn_swmmac_f32_16x16x64_f16      : 
AMDGPUSWmmacIntrinsicABIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, 
llvm_anyfloat_ty, llvm_anyint_ty>;
 def int_amdgcn_swmmac_f32_16x16x64_bf16     : 
AMDGPUSWmmacIntrinsicABIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, 
llvm_anyfloat_ty, llvm_anyint_ty>;
@@ -4119,7 +4138,7 @@ def int_amdgcn_swmmac_f16_16x16x128_fp8_fp8 : 
AMDGPUSWmmacIntrinsicIdxReuse<llvm
 def int_amdgcn_swmmac_f16_16x16x128_fp8_bf8 : 
AMDGPUSWmmacIntrinsicIdxReuse<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, 
llvm_anyint_ty>;
 def int_amdgcn_swmmac_f16_16x16x128_bf8_fp8 : 
AMDGPUSWmmacIntrinsicIdxReuse<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, 
llvm_anyint_ty>;
 def int_amdgcn_swmmac_f16_16x16x128_bf8_bf8 : 
AMDGPUSWmmacIntrinsicIdxReuse<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, 
llvm_anyint_ty>;
-def int_amdgcn_swmmac_i32_16x16x128_iu8     : 
AMDGPUSWmmacIntrinsicABIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, 
llvm_anyint_ty>;
+def int_amdgcn_swmmac_i32_16x16x128_iu8     : 
AMDGPUSWmmacIntrinsicABIdxClamp<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, 
llvm_anyint_ty>;
 }
 
 
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index cbb7b6ee4f3f5..d8b54c396df28 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -32,6 +32,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsAArch64.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsARM.h"
 #include "llvm/IR/IntrinsicsNVPTX.h"
 #include "llvm/IR/IntrinsicsRISCV.h"
@@ -1284,6 +1285,18 @@ static bool upgradeIntrinsicFunction1(Function *F, 
Function *&NewFn,
         break; // No other 'amdgcn.atomic.*'
       }
 
+      // Legacy wmma iu intrinsics without the optional clamp operand.
+      if (F->getIntrinsicID() == Intrinsic::amdgcn_wmma_i32_16x16x64_iu8 &&
+          F->arg_size() == 7) {
+        NewFn = nullptr;
+        return true;
+      }
+      if (F->getIntrinsicID() == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8 &&
+          F->arg_size() == 8) {
+        NewFn = nullptr;
+        return true;
+      }
+
       if (Name.consume_front("ds.") || Name.consume_front("global.atomic.") ||
           Name.consume_front("flat.atomic.")) {
         if (Name.starts_with("fadd") ||
@@ -4620,6 +4633,50 @@ static Value *upgradeARMIntrinsicCall(StringRef Name, 
CallBase *CI, Function *F,
 //
 static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, CallBase *CI,
                                          Function *F, IRBuilder<> &Builder) {
+  // Legacy WMMA iu intrinsics missed the optional clamp operand. Append 
clamp=0
+  // for compatibility.
+  auto UpgradeLegacyWMMAIUIntrinsicCall =
+      [](Function *F, CallBase *CI, IRBuilder<> &Builder,
+         ArrayRef<Type *> OverloadTys) -> Value * {
+    // Prepare arguments, append clamp=0 for compatibility
+    SmallVector<Value *, 10> Args(CI->args().begin(), CI->args().end());
+    Args.push_back(Builder.getFalse());
+
+    // Insert the declaration for the right overload types
+    Function *NewDecl = Intrinsic::getOrInsertDeclaration(
+        F->getParent(), F->getIntrinsicID(), OverloadTys);
+
+    // Copy operand bundles if any
+    SmallVector<OperandBundleDef, 1> Bundles;
+    CI->getOperandBundlesAsDefs(Bundles);
+
+    // Create the new call and copy calling properties
+    auto *NewCall = cast<CallInst>(Builder.CreateCall(NewDecl, Args, Bundles));
+    NewCall->setTailCallKind(cast<CallInst>(CI)->getTailCallKind());
+    NewCall->setCallingConv(CI->getCallingConv());
+    NewCall->setAttributes(CI->getAttributes());
+    NewCall->setDebugLoc(CI->getDebugLoc());
+    NewCall->copyMetadata(*CI);
+    return NewCall;
+  };
+
+  if (F->getIntrinsicID() == Intrinsic::amdgcn_wmma_i32_16x16x64_iu8) {
+    assert(CI->arg_size() == 7 && "Legacy int_amdgcn_wmma_i32_16x16x64_iu8 "
+                                  "intrinsic should have 7 arguments");
+    Type *T1 = CI->getArgOperand(4)->getType();
+    Type *T2 = CI->getArgOperand(1)->getType();
+    return UpgradeLegacyWMMAIUIntrinsicCall(F, CI, Builder, {T1, T2});
+  }
+  if (F->getIntrinsicID() == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8) {
+    assert(CI->arg_size() == 8 && "Legacy int_amdgcn_swmmac_i32_16x16x128_iu8 "
+                                  "intrinsic should have 8 arguments");
+    Type *T1 = CI->getArgOperand(4)->getType();
+    Type *T2 = CI->getArgOperand(1)->getType();
+    Type *T3 = CI->getArgOperand(3)->getType();
+    Type *T4 = CI->getArgOperand(5)->getType();
+    return UpgradeLegacyWMMAIUIntrinsicCall(F, CI, Builder, {T1, T2, T3, T4});
+  }
+
   AtomicRMWInst::BinOp RMWOp =
       StringSwitch<AtomicRMWInst::BinOp>(Name)
           .StartsWith("ds.fadd", AtomicRMWInst::FAdd)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 3dd8dd2b90b4a..fc9aa76c39ccb 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10248,11 +10248,13 @@ SDValue 
SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
 
     SDLoc SL(Op);
     auto IndexKey = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy);
-    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
-                       {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
-                        Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
-                        IndexKey, Op.getOperand(7),
-                        Op.getOperand(8)}); // No clamp operand
+    SmallVector<SDValue> Args{
+        Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
+        Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
+        IndexKey,         Op.getOperand(7), Op.getOperand(8)};
+    if (IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8)
+      Args.push_back(Op.getOperand(9));
+    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), Args);
   }
   case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
   case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td 
b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 8f42c1ad84b15..03d4699ec4c30 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -1463,7 +1463,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit 
_IsSWMMAC, int _IndexType,
   let HasMatrixReuse = _HasMatrixReuse;
 
   bit HasIModOp = _Has_ImodOp;
-  let HasClamp = !and(IsIU, !not(HasIModOp));
+  let HasClamp = IsIU;
   let IsPacked = 1;
   let IsWMMA = !not(_IsSWMMAC);
   let IsSWMMAC = _IsSWMMAC;
@@ -1568,7 +1568,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit 
_IsSWMMAC, int _IndexType,
                                              
MatrixAScaleFmt:$matrix_a_scale_fmt, MatrixBScaleFmt:$matrix_b_scale_fmt),
                                         (ins));
   dag MatrixReuse = !if(HasMatrixReuse, (ins MatrixAReuse:$matrix_a_reuse, 
MatrixBReuse:$matrix_b_reuse), (ins));
-  dag Clamp = !if(HasClamp, (ins Clamp0:$clamp), (ins));
+  dag ClampOp = !if(HasClamp, (ins Clamp:$clamp), (ins));
   dag Neg = !cond(!and(NegLoAny, NegHiAny)             : (ins neg_lo0:$neg_lo, 
neg_hi0:$neg_hi),
                   !and(NegLoAny, !not(NegHiAny))       : (ins neg_lo0:$neg_lo),
                   !and(!not(NegLoAny), !not(NegHiAny)) : (ins));
@@ -1580,7 +1580,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit 
_IsSWMMAC, int _IndexType,
                                                  (ins VRegSrc_64:$src2),
                                                  (ins VRegSrc_32:$src2)),
                                             IndexKey)),
-                      MatrixScaleSrc, MatrixFMT, MatrixScale, MatrixReuse, 
Clamp, Neg);
+                      MatrixScaleSrc, ClampOp, MatrixFMT, MatrixScale, 
MatrixReuse, Neg);
 
   // asm
 
@@ -1597,7 +1597,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit 
_IsSWMMAC, int _IndexType,
                         !and(NegLoAny, !not(NegHiAny))       : "$neg_lo",
                         !and(!not(NegLoAny), !not(NegHiAny)) : "");
 
-  let AsmVOP3P = "$vdst, $src0, $src1, 
$src2"#IndexKeyAsm#MatrixScaleSrcAsm#MatrxFMTAsm#MatrixScaleAsm#MatrixReuseAsm#NegAsm#ClampAsm;
+  let AsmVOP3P = "$vdst, $src0, $src1, 
$src2"#IndexKeyAsm#MatrixScaleSrcAsm#ClampAsm#MatrxFMTAsm#MatrixScaleAsm#MatrixReuseAsm#NegAsm;
 
   // isel patterns
   bit IsAB_BF16_IMod0 = !and(IsAB_BF16, !not(HasIModOp));
@@ -1669,17 +1669,17 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit 
_IsSWMMAC, int _IndexType,
   dag MatrixReuseOutModPat = !if(HasMatrixReuse, (ins i1:$matrix_a_reuse, 
i1:$matrix_b_reuse), (ins));
 
   dag WmmaInPat  = !con(Src0InPat, Src1InPat, Src2InPatWmma, MatrixScaleInPat, 
MatrixReuseInPat, ClampPat);
-  dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, 
MatrixScaleOutSrcPat, MatrixFMTOutPat,
-                        MatrixScaleOutModPat, MatrixReuseOutModPat, ClampPat);
+  dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, 
MatrixScaleOutSrcPat, ClampPat, MatrixFMTOutPat,
+                        MatrixScaleOutModPat, MatrixReuseOutModPat);
 
   dag SwmmacInPat  = !con(Src0InPat, Src1InPat, (ins Src2VT:$srcTiedDef), 
IndexInPat, MatrixReuseInPat, ClampPat);
-  dag SwmmacOutPat = !con(Src0OutPat, Src1OutPat, (ins Src2VT:$srcTiedDef), 
IndexOutPat, MatrixReuseOutModPat, ClampPat);
+  dag SwmmacOutPat = !con(Src0OutPat, Src1OutPat, (ins Src2VT:$srcTiedDef), 
IndexOutPat, ClampPat, MatrixReuseOutModPat);
 
   // wmma pattern where src2 is inline imm uses _threeaddr pseudo,
   // can't use _twoaddr since it would violate src2 tied to vdst constraint.
   dag WmmaInlineInPat  = !con(Src0InPat, Src1InPat, Src2InlineInPat, 
MatrixScaleInPat, MatrixReuseInPat, ClampPat);
-  dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, 
MatrixScaleOutSrcPat,
-                              MatrixFMTOutPat, MatrixScaleOutModPat, 
MatrixReuseOutModPat, ClampPat);
+  dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, 
MatrixScaleOutSrcPat, ClampPat,
+                              MatrixFMTOutPat, MatrixScaleOutModPat, 
MatrixReuseOutModPat);
 }
 
 def WMMAInstInfoTable : GenericTable {
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll 
b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index d5c6000a1eef6..db6e294cb90fe 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -295,9 +295,9 @@ define amdgpu_kernel void @wmma_f16_16x16x64_bf8_bf8(<8 x 
i32> %A, <8 x i32> %B,
   ret void
 }
 
-; CHECK: DIVERGENT: %tmp0 = call <8 x i32> 
@llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 false, <8 x i32> %A, i1 
false, <8 x i32> %B, <8 x i32> %C, i1 false, i1 false)
+; CHECK: DIVERGENT: %tmp0 = call <8 x i32> 
@llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 false, <8 x i32> %A, i1 
false, <8 x i32> %B, <8 x i32> %C, i1 false, i1 false, i1 false)
 define amdgpu_kernel void @wmma_i32_16x16x64_iu8(<8 x i32> %A, <8 x i32> %B, 
<8 x i32> %C, ptr addrspace(1) %out) {
-  %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 0, 
<8 x i32> %A, i1 0, <8 x i32> %B, <8 x i32> %C, i1 false, i1 false)
+  %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 0, 
<8 x i32> %A, i1 0, <8 x i32> %B, <8 x i32> %C, i1 false, i1 false, i1 false)
   store <8 x i32> %tmp0, ptr addrspace(1) %out
   ret void
 }
@@ -417,9 +417,9 @@ define amdgpu_ps void @swmmac_f16_16x16x128_bf8_bf8(<8 x 
i32> %A, <16 x i32> %B,
   ret void
 }
 
-; CHECK: DIVERGENT:   %tmp0 = call <8 x i32> 
@llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i16(i1 false, <8 x 
i32> %A, i1 false, <16 x i32> %B, <8 x i32> %C, i16 %Index, i1 false, i1 false)
+; CHECK: DIVERGENT:   %tmp0 = call <8 x i32> 
@llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i16(i1 false, <8 x 
i32> %A, i1 false, <16 x i32> %B, <8 x i32> %C, i16 %Index, i1 false, i1 false, 
i1 false)
 define amdgpu_ps void @swmmac_i32_16x16x128_iu8(<8 x i32> %A, <16 x i32> %B, 
<8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-  %tmp0 = call <8 x i32> 
@llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i16(i1 0, <8 x i32> 
%A, i1 0, <16 x i32> %B, <8 x i32> %C, i16 %Index, i1 false, i1 false)
+  %tmp0 = call <8 x i32> 
@llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i16(i1 0, <8 x i32> 
%A, i1 0, <16 x i32> %B, <8 x i32> %C, i16 %Index, i1 false, i1 false, i1 false)
   store <8 x i32> %tmp0, ptr addrspace(1) %out
   ret void
 }
@@ -903,7 +903,7 @@ declare <8 x half> 
@llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32>,
 declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x 
i32>, <8 x i32>, i16, <8 x half>, i1, i1)
 declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x 
i32>, <8 x i32>, i16, <8 x half>, i1, i1)
 declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x 
i32>, <8 x i32>, i16, <8 x half>, i1, i1)
-declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 
x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1)
+declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 
x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1, i1)
 declare <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, 
i32, <16 x i32>, i16, <8 x float>)
 declare <8 x float> 
@llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x 
i32>, i32, <16 x i32>, i16, <8 x float>, i32, i32, i32, i32, i32, i32, i1, i1)
 declare <8 x float> 
@llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x 
i32>, i32, <16 x i32>, i16, <8 x float>, i32, i32, i64, i32, i32, i64, i1, i1)
@@ -920,7 +920,7 @@ declare <8 x half> 
@llvm.amdgcn.swmmac.f16.16x16x128.fp8.fp8.v8f16.v8i32.v16i32.
 declare <8 x half> 
@llvm.amdgcn.swmmac.f16.16x16x128.fp8.bf8.v8f16.v8i32.v16i32.i16(<8 x i32>, <16 
x i32>, <8 x half>, i16, i1, i1)
 declare <8 x half> 
@llvm.amdgcn.swmmac.f16.16x16x128.bf8.fp8.v8f16.v8i32.v16i32.i16(<8 x i32>, <16 
x i32>, <8 x half>, i16, i1, i1)
 declare <8 x half> 
@llvm.amdgcn.swmmac.f16.16x16x128.bf8.bf8.v8f16.v8i32.v16i32.i16(<8 x i32>, <16 
x i32>, <8 x half>, i16, i1, i1)
-declare <8 x i32> 
@llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i16(i1 immarg, <8 x 
i32>, i1 immarg, <16 x i32>, <8 x i32>, i16 %Index, i1, i1)
+declare <8 x i32> 
@llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i16(i1 immarg, <8 x 
i32>, i1 immarg, <16 x i32>, <8 x i32>, i16 %Index, i1, i1, i1)
 
 declare <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1))
 declare <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1))
diff --git a/llvm/test/Bitcode/amdgpu-wmma-clamp-upgrade.ll 
b/llvm/test/Bitcode/amdgpu-wmma-clamp-upgrade.ll
new file mode 100644
index 0000000000000..79d1f46bc346c
--- /dev/null
+++ b/llvm/test/Bitcode/amdgpu-wmma-clamp-upgrade.ll
@@ -0,0 +1,25 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+
+define amdgpu_ps void @swmmac_i32_16x16x128_iu8(<8 x i32> %A, <16 x i32> %B, 
<8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_ps void @swmmac_i32_16x16x128_iu8(
+; CHECK-SAME: <8 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], 
i16 [[INDEX:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> 
@llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i16(i1 false, <8 x 
i32> [[A]], i1 false, <16 x i32> [[B]], <8 x i32> [[C]], i16 [[INDEX]], i1 
false, i1 false, i1 false)
+; CHECK-NEXT:    store <8 x i32> [[TMP1]], ptr addrspace(1) [[OUT]], align 32
+; CHECK-NEXT:    ret void
+;
+  %tmp0 = call <8 x i32> 
@llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i16(i1 0, <8 x i32> 
%A, i1 0, <16 x i32> %B, <8 x i32> %C, i16 %Index, i1 false, i1 false)
+  store <8 x i32> %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @wmma_i32_16x16x64_iu8(<8 x i32> %A, <8 x i32> %B, 
<8 x i32> %C, ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @wmma_i32_16x16x64_iu8(
+; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], 
ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> 
@llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 false, <8 x i32> [[A]], i1 
false, <8 x i32> [[B]], <8 x i32> [[C]], i1 false, i1 false, i1 false)
+; CHECK-NEXT:    store <8 x i32> [[TMP1]], ptr addrspace(1) [[OUT]], align 32
+; CHECK-NEXT:    ret void
+;
+  %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 0, 
<8 x i32> %A, i1 0, <8 x i32> %B, <8 x i32> %C, i1 false, i1 false)
+  store <8 x i32> %tmp0, ptr addrspace(1) %out
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-coececution-valu-hazards.mir 
b/llvm/test/CodeGen/AMDGPU/wmma-coececution-valu-hazards.mir
index 2f7a6e257bb96..fb9398180e4f7 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-coececution-valu-hazards.mir
+++ b/llvm/test/CodeGen/AMDGPU/wmma-coececution-valu-hazards.mir
@@ -319,7 +319,7 @@ name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Use1
 body: |
   bb.0:
     ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Use1
-    ; GFX1250: early-clobber 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit 
$exec
+    ; GFX1250: early-clobber 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit 
$exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
@@ -329,7 +329,7 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr16, $vgpr33, implicit $mode, 
implicit $exec
-    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit 
$exec
+    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit 
$exec
     $vgpr32 = V_ADD_F32_e32 $vgpr16, $vgpr33, implicit $mode, implicit $exec
 ...
 
@@ -338,7 +338,7 @@ name: 
test_wmma_I32_16x16x64_IU8_D0_overlaps_Use1_with_8_valus_in_between
 body: |
   bb.0:
     ; GFX1250-LABEL: name: 
test_wmma_I32_16x16x64_IU8_D0_overlaps_Use1_with_8_valus_in_between
-    ; GFX1250: early-clobber 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit 
$exec
+    ; GFX1250: early-clobber 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit 
$exec
     ; GFX1250-NEXT: $vgpr40 = V_MOV_B32_e32 40, implicit $exec
     ; GFX1250-NEXT: $vgpr41 = V_MOV_B32_e32 41, implicit $exec
     ; GFX1250-NEXT: $vgpr42 = V_MOV_B32_e32 42, implicit $exec
@@ -348,7 +348,7 @@ body: |
     ; GFX1250-NEXT: $vgpr46 = V_MOV_B32_e32 46, implicit $exec
     ; GFX1250-NEXT: $vgpr47 = V_MOV_B32_e32 47, implicit $exec
     ; GFX1250-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr16, $vgpr33, implicit $mode, 
implicit $exec
-    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit 
$exec
+    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit 
$exec
     $vgpr40 = V_MOV_B32_e32 40, implicit $exec
     $vgpr41 = V_MOV_B32_e32 41, implicit $exec
     $vgpr42 = V_MOV_B32_e32 42, implicit $exec
@@ -365,7 +365,7 @@ name: 
test_wmma_I32_16x16x64_IU8_D0_overlaps_Use1_with_8_salus_in_between
 body: |
   bb.0:
     ; GFX1250-LABEL: name: 
test_wmma_I32_16x16x64_IU8_D0_overlaps_Use1_with_8_salus_in_between
-    ; GFX1250: early-clobber 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit 
$exec
+    ; GFX1250: early-clobber 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit 
$exec
     ; GFX1250-NEXT: $sgpr0 = S_MOV_B32 0
     ; GFX1250-NEXT: $sgpr1 = S_MOV_B32 1
     ; GFX1250-NEXT: $sgpr2 = S_MOV_B32 2
@@ -383,7 +383,7 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr16, $vgpr33, implicit $mode, 
implicit $exec
-    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit 
$exec
+    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit 
$exec
     $sgpr0 = S_MOV_B32 0
     $sgpr1 = S_MOV_B32 1
     $sgpr2 = S_MOV_B32 2
@@ -400,7 +400,7 @@ name: test_wmma_I32_16x16x64_IU8_D0_overlaps_D1
 body: |
   bb.0:
     ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_D1
-    ; GFX1250: early-clobber 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit 
$exec
+    ; GFX1250: early-clobber 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit 
$exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
@@ -410,7 +410,7 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr16 = V_ADD_F32_e32 $vgpr32, $vgpr33, implicit $mode, 
implicit $exec
-    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit 
$exec
+    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit 
$exec
     $vgpr16 = V_ADD_F32_e32 $vgpr32, $vgpr33, implicit $mode, implicit $exec
 ...
 
@@ -419,7 +419,7 @@ name: test_wmma_I32_16x16x64_IU8_A0_overlaps_D1
 body: |
   bb.0:
     ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_A0_overlaps_D1
-    ; GFX1250: early-clobber 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit 
$exec
+    ; GFX1250: early-clobber 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit 
$exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
@@ -429,7 +429,7 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr32, $vgpr33, implicit $mode, 
implicit $exec
-    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit 
$exec
+    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit 
$exec
     $vgpr0 = V_ADD_F32_e32 $vgpr32, $vgpr33, implicit $mode, implicit $exec
 ...
 
@@ -438,7 +438,7 @@ name: test_wmma_I32_16x16x64_IU8_B0_overlaps_D1
 body: |
   bb.0:
     ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_B0_overlaps_D1
-    ; GFX1250: early-clobber 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit 
$exec
+    ; GFX1250: early-clobber 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit 
$exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
@@ -448,7 +448,7 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr8 = V_ADD_F32_e32 $vgpr32, $vgpr33, implicit $mode, 
implicit $exec
-    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit 
$exec
+    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit 
$exec
     $vgpr8 = V_ADD_F32_e32 $vgpr32, $vgpr33, implicit $mode, implicit $exec
 ...
 
@@ -789,13 +789,13 @@ name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_Use1
 body: |
   bb.0:
     ; GFX1250-LABEL: name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_Use1
-    ; GFX1250: early-clobber 
$vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+    ; GFX1250: early-clobber 
$vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr32_vgpr33, 0, 0, 0, 0, 0, implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr34 = V_ADD_F32_e32 $vgpr33, $vgpr24, implicit $mode, 
implicit $exec
-    $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+    $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr32_vgpr33, 0, 0, 0, 0, 0, implicit $exec
     $vgpr34 = V_ADD_F32_e32 $vgpr33, $vgpr24, implicit $mode, implicit $exec
 ...
 
@@ -804,13 +804,13 @@ name: 
test_swmmac_i32_16x16x128_iu8_D0_overlaps_Use1_with_4_valus_in_between
 body: |
   bb.0:
     ; GFX1250-LABEL: name: 
test_swmmac_i32_16x16x128_iu8_D0_overlaps_Use1_with_4_valus_in_between
-    ; GFX1250: early-clobber 
$vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+    ; GFX1250: early-clobber 
$vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr32_vgpr33, 0, 0, 0, 0, 0, implicit $exec
     ; GFX1250-NEXT: $vgpr40 = V_MOV_B32_e32 40, implicit $exec
     ; GFX1250-NEXT: $vgpr41 = V_MOV_B32_e32 41, implicit $exec
     ; GFX1250-NEXT: $vgpr42 = V_MOV_B32_e32 42, implicit $exec
     ; GFX1250-NEXT: $vgpr43 = V_MOV_B32_e32 43, implicit $exec
     ; GFX1250-NEXT: $vgpr34 = V_ADD_F32_e32 $vgpr34, $vgpr24, implicit $mode, 
implicit $exec
-    $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+    $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr32_vgpr33, 0, 0, 0, 0, 0, implicit $exec
     $vgpr40 = V_MOV_B32_e32 40, implicit $exec
     $vgpr41 = V_MOV_B32_e32 41, implicit $exec
     $vgpr42 = V_MOV_B32_e32 42, implicit $exec
@@ -823,7 +823,7 @@ name: 
test_swmmac_i32_16x16x128_iu8_D0_overlaps_Use1_with_4_salus_in_between
 body: |
   bb.0:
     ; GFX1250-LABEL: name: 
test_swmmac_i32_16x16x128_iu8_D0_overlaps_Use1_with_4_salus_in_between
-    ; GFX1250: early-clobber 
$vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+    ; GFX1250: early-clobber 
$vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr32_vgpr33, 0, 0, 0, 0, 0, implicit $exec
     ; GFX1250-NEXT: $sgpr0 = S_MOV_B32 0
     ; GFX1250-NEXT: $sgpr1 = S_MOV_B32 1
     ; GFX1250-NEXT: $sgpr2 = S_MOV_B32 2
@@ -833,7 +833,7 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr34 = V_ADD_F32_e32 $vgpr34, $vgpr24, implicit $mode, 
implicit $exec
-    $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+    $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr32_vgpr33, 0, 0, 0, 0, 0, implicit $exec
     $sgpr0 = S_MOV_B32 0
     $sgpr1 = S_MOV_B32 1
     $sgpr2 = S_MOV_B32 2
@@ -846,13 +846,13 @@ name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_D1
 body: |
   bb.0:
     ; GFX1250-LABEL: name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_D1
-    ; GFX1250: early-clobber 
$vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+    ; GFX1250: early-clobber 
$vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr32_vgpr33, 0, 0, 0, 0, 0, implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr24 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, 
implicit $exec
-    $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+    $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr32_vgpr33, 0, 0, 0, 0, 0, implicit $exec
     $vgpr24 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec
 ...
 
@@ -861,13 +861,13 @@ name: test_swmmac_i32_16x16x128_iu8_A0_overlaps_D1
 body: |
   bb.0:
     ; GFX1250-LABEL: name: test_swmmac_i32_16x16x128_iu8_A0_overlaps_D1
-    ; GFX1250: early-clobber 
$vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+    ; GFX1250: early-clobber 
$vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr32_vgpr33, 0, 0, 0, 0, 0, implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, 
implicit $exec
-    $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+    $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr32_vgpr33, 0, 0, 0, 0, 0, implicit $exec
     $vgpr0 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec
 ...
 
@@ -876,13 +876,13 @@ name: test_swmmac_i32_16x16x128_iu8_B0_overlaps_D1
 body: |
   bb.0:
     ; GFX1250-LABEL: name: test_swmmac_i32_16x16x128_iu8_B0_overlaps_D1
-    ; GFX1250: early-clobber 
$vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+    ; GFX1250: early-clobber 
$vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr32_vgpr33, 0, 0, 0, 0, 0, implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr8 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, 
implicit $exec
-    $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+    $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr32_vgpr33, 0, 0, 0, 0, 0, implicit $exec
     $vgpr8 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec
 ...
 
@@ -891,12 +891,12 @@ name: test_swmmac_i32_16x16x128_iu8_Index0_overlaps_D1
 body: |
   bb.0:
     ; GFX1250-LABEL: name: test_swmmac_i32_16x16x128_iu8_Index0_overlaps_D1
-    ; GFX1250: early-clobber 
$vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+    ; GFX1250: early-clobber 
$vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr32_vgpr33, 0, 0, 0, 0, 0, implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, 
implicit $exec
-    $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+    $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr32_vgpr33, 0, 0, 0, 0, 0, implicit $exec
     $vgpr32 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx1250-w32.mir 
b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx1250-w32.mir
index fa3b9244c3e4a..aaccc179596e7 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx1250-w32.mir
+++ b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx1250-w32.mir
@@ -574,7 +574,7 @@ name: test_wmma_I32_16x16x64_IU8_D0_overlaps_A1
 body: |
   bb.0:
     ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_A1
-    ; GFX1250: early-clobber 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit 
$exec
+    ; GFX1250: early-clobber 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit 
$exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
@@ -584,9 +584,9 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
-    ; GFX1250-NEXT: early-clobber 
$vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed 
$vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed 
$vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, implicit 
$exec
-    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit 
$exec
-    $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed 
$vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed 
$vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, implicit 
$exec
+    ; GFX1250-NEXT: early-clobber 
$vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed 
$vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed 
$vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit 
$exec
+    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit 
$exec
+    $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed 
$vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed 
$vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit 
$exec
 ...
 
 ---
@@ -594,7 +594,7 @@ name: test_wmma_I32_16x16x64_IU8_D0_overlaps_B1
 body: |
   bb.0:
     ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_B1
-    ; GFX1250: early-clobber 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit 
$exec
+    ; GFX1250: early-clobber 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit 
$exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
@@ -604,9 +604,9 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
-    ; GFX1250-NEXT: early-clobber 
$vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed 
$vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, implicit 
$exec
-    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit 
$exec
-    $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed 
$vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, implicit 
$exec
+    ; GFX1250-NEXT: early-clobber 
$vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed 
$vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit 
$exec
+    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit 
$exec
+    $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed 
$vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit 
$exec
 ...
 
 ---
@@ -614,7 +614,7 @@ name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Index1
 body: |
   bb.0:
     ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Index1
-    ; GFX1250: early-clobber 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit 
$exec
+    ; GFX1250: early-clobber 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit 
$exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
@@ -625,7 +625,7 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber 
$vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = 
V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed 
$vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed 
$vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55,
 killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed 
$vgpr16_vgpr17, 0, 0, 0, implicit $exec
-    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit 
$exec
+    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = 
V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit 
$exec
     $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = 
V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed 
$vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed 
$vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55,
 killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed 
$vgpr16_vgpr17, 0, 0, 0, implicit $exec
 ...
 
@@ -1518,15 +1518,15 @@ name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_A1
 body: |
   bb.0:
     ; GFX1250-LABEL: name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_A1
-    ; GFX1250: early-clobber 
$vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr88_vgpr89, 0, 0, 0, 0, implicit $exec
+    ; GFX1250: early-clobber 
$vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr88_vgpr89, 0, 0, 0, 0, 0, implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
-    ; GFX1250-NEXT: early-clobber 
$vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed 
$vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47,
 killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed 
$vgpr90_vgpr91, 0, 0, 0, 0, implicit $exec
-    $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr88_vgpr89, 0, 0, 0, 0, implicit $exec
-    $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed 
$vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47,
 killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed 
$vgpr90_vgpr91, 0, 0, 0, 0, implicit $exec
+    ; GFX1250-NEXT: early-clobber 
$vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed 
$vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47,
 killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed 
$vgpr90_vgpr91, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr88_vgpr89, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed 
$vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47,
 killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed 
$vgpr90_vgpr91, 0, 0, 0, 0, 0, implicit $exec
 ...
 
 ---
@@ -1534,15 +1534,15 @@ name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_B1
 body: |
   bb.0:
     ; GFX1250-LABEL: name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_B1
-    ; GFX1250: early-clobber 
$vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr88_vgpr89, 0, 0, 0, 0, implicit $exec
+    ; GFX1250: early-clobber 
$vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr88_vgpr89, 0, 0, 0, 0, 0, implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
-    ; GFX1250-NEXT: early-clobber 
$vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed 
$vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39,
 killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed 
$vgpr90_vgpr91, 0, 0, 0, 0, implicit $exec
-    $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr88_vgpr89, 0, 0, 0, 0, implicit $exec
-    $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed 
$vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39,
 killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed 
$vgpr90_vgpr91, 0, 0, 0, 0, implicit $exec
+    ; GFX1250-NEXT: early-clobber 
$vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed 
$vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39,
 killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed 
$vgpr90_vgpr91, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr88_vgpr89, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed 
$vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39,
 killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed 
$vgpr90_vgpr91, 0, 0, 0, 0, 0, implicit $exec
 ...
 
 ---
@@ -1550,14 +1550,14 @@ name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_Index1
 body: |
   bb.0:
     ; GFX1250-LABEL: name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_Index1
-    ; GFX1250: early-clobber 
$vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr88_vgpr89, 0, 0, 0, 0, implicit $exec
+    ; GFX1250: early-clobber 
$vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr88_vgpr89, 0, 0, 0, 0, 0, implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber 
$vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = 
V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed 
$vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed 
$vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55,
 killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed 
$vgpr24_vgpr25, 0, 0, 0, implicit $exec
-    $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr88_vgpr89, 0, 0, 0, 0, implicit $exec
+    $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = 
V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23,
 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr88_vgpr89, 0, 0, 0, 0, 0, implicit $exec
     $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = 
V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed 
$vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed 
$vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55,
 killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed 
$vgpr24_vgpr25, 0, 0, 0, implicit $exec
 ...
 
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s 
b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s
index fcfff9ac5b63d..46bc887374e5c 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s
@@ -493,6 +493,11 @@ v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], 
v[16:23] matrix_b_reuse
 // GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] 
matrix_b_reuse ; encoding: [0x10,0x40,0x72,0xcc,0x00,0x11,0x42,0x1c]
 // WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
+v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] clamp
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] clamp 
; encoding: [0x10,0x80,0x72,0xcc,0x00,0x11,0x42,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
+
 v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23]
 // GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] ; 
encoding: [0x10,0x00,0x60,0xcc,0x00,0x11,0x42,0x1c]
@@ -863,6 +868,11 @@ v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], 
v[32:33] matrix_b_reuse
 // GFX1250: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] 
matrix_b_reuse ; encoding: [0x18,0x40,0x7b,0xcc,0x00,0x11,0x82,0x1c]
 // WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
+v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] clamp
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] 
clamp ; encoding: [0x18,0x80,0x7b,0xcc,0x00,0x11,0x82,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
+
 v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32
 // GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 ; 
encoding: [0x18,0x00,0x65,0xcc,0x00,0x11,0x82,0x1c]
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32_err.s 
b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32_err.s
index 41cac9d1470ae..f484448635f37 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32_err.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32_err.s
@@ -126,9 +126,6 @@ v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], s[16:23]
 v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], 128
 // GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
 
-v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] clamp
-// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
-
 v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
 // GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
 
@@ -330,9 +327,6 @@ v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], 
s[32:33]
 v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], 1, v[32:33]
 // GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
 
-v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] clamp
-// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
-
 v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:2
 // GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
 
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt 
b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt
index 5d73cbd512edb..1dea7e138009e 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt
@@ -199,6 +199,9 @@
 0x18,0x00,0x7b,0xcc,0x00,0x11,0x82,0x3c
 # GFX1250: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] 
neg_lo:[1,0,0] ; encoding: [0x18,0x00,0x7b,0xcc,0x00,0x11,0x82,0x3c]
 
+0x18,0x80,0x7b,0xcc,0x00,0x11,0x82,0x1c
+# GFX1250: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] 
clamp ; encoding: [0x18,0x80,0x7b,0xcc,0x00,0x11,0x82,0x1c]
+
 0x10,0x00,0x63,0xcc,0x00,0x11,0xca,0x1b
 # GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], 1.0 ; 
encoding: [0x10,0x00,0x63,0xcc,0x00,0x11,0xca,0x1b]
 
@@ -586,6 +589,9 @@
 0x10,0x00,0x72,0xcc,0x00,0x11,0x42,0x3c
 # GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] 
neg_lo:[1,0,0] ; encoding: [0x10,0x00,0x72,0xcc,0x00,0x11,0x42,0x3c]
 
+0x10,0x80,0x72,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] clamp ; 
encoding: [0x10,0x80,0x72,0xcc,0x00,0x11,0x42,0x1c]
+
 0x00,0x00,0x3a,0xcc,0x82,0x88,0x01,0x04
 # GFX1250: v_wmma_ld_scale16_paired_b64 2, -4      ; encoding: 
[0x00,0x00,0x3a,0xcc,0x82,0x88,0x01,0x04]
 
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td 
b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 7968d14be6592..265c2e99f52d6 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -804,8 +804,8 @@ class ROCDL_WMMA_ModsAll_Diff_IntrOp<string mnemonic, Type 
AB, Type C, Type D> :
   }];
 }
 
-class ROCDL_WMMA_ModsAB_IntrOp<string mnemonic, Type AB, Type CD> : 
ROCDL_IntrOp<mnemonic,
-    [0], [1], [], 1, 0, 0, 0, [0, 2, 5, 6], ["signA", "signB", 
"reuseA","reuseB"]>,
+class ROCDL_WMMA_ModsABClamp_IntrOp<string mnemonic, Type AB, Type CD> : 
ROCDL_IntrOp<mnemonic,
+    [0], [1], [], 1, 0, 0, 0, [0, 2, 5, 6, 7], ["signA", "signB", 
"reuseA","reuseB", "clamp"]>,
   Arguments<(ins
              DefaultValuedAttr<I1Attr, "0">:$signA,
              LLVM_ScalarOrVectorOf<AB>:$a,
@@ -813,7 +813,8 @@ class ROCDL_WMMA_ModsAB_IntrOp<string mnemonic, Type AB, 
Type CD> : ROCDL_IntrOp
              LLVM_ScalarOrVectorOf<AB>:$b,
              LLVM_ScalarOrVectorOf<CD>:$c,
              DefaultValuedAttr<I1Attr, "0">:$reuseA,
-             DefaultValuedAttr<I1Attr, "0">:$reuseB)> {
+             DefaultValuedAttr<I1Attr, "0">:$reuseB,
+             DefaultValuedAttr<I1Attr, "0">:$clamp)> {
   let results = (outs LLVM_ScalarOrVectorOf<CD>:$res);
   let assemblyFormat = [{
     $a `,` $b `,` $c attr-dict `:` functional-type(operands, $res)
@@ -906,7 +907,7 @@ def ROCDL_wmma_f16_16x16x128_fp8_fp8 : 
ROCDL_WMMA_ModsC_IntrOp<"wmma.f16.16x16x1
 def ROCDL_wmma_f16_16x16x128_fp8_bf8 : 
ROCDL_WMMA_ModsC_IntrOp<"wmma.f16.16x16x128.fp8_bf8", AnyInteger, F16>;
 def ROCDL_wmma_f16_16x16x128_bf8_fp8 : 
ROCDL_WMMA_ModsC_IntrOp<"wmma.f16.16x16x128.bf8_fp8", AnyInteger, F16>;
 def ROCDL_wmma_f16_16x16x128_bf8_bf8 : 
ROCDL_WMMA_ModsC_IntrOp<"wmma.f16.16x16x128.bf8_bf8", AnyInteger, F16>;
-def ROCDL_wmma_i32_16x16x64_iu8 : 
ROCDL_WMMA_ModsAB_IntrOp<"wmma.i32.16x16x64.iu8", AnyInteger, AnyInteger>;
+def ROCDL_wmma_i32_16x16x64_iu8 : 
ROCDL_WMMA_ModsABClamp_IntrOp<"wmma.i32.16x16x64.iu8", AnyInteger, AnyInteger>;
 
 // Scaled wmma intrinsics (available from gfx1250)
 def ROCDL_wmma_scale_f32_16x16x128_f8f6f4   : 
ROCDL_WMMA_Scale_IntrOp<"wmma.scale.f32.16x16x128.f8f6f4", AnyInteger, F32, 
I32>;
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir 
b/mlir/test/Target/LLVMIR/rocdl.mlir
index cc3df8cd05087..dc6a00e19afc3 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -1156,16 +1156,16 @@ llvm.func @rocdl.wmma(%arg0 : vector<8xf32>, %arg1 : 
vector<16 x f16>, %arg2 : v
   %r22.gfx1250 = rocdl.wmma.f16.16x16x128.bf8_bf8 %arg5, %arg5, %arg15 {signA 
= false, signB = false, modC = 0 : i16} : (vector<4xi32>, vector<4xi32>, 
vector<64xf16>) -> vector<64xf16>
 
   // iu8 -> i32
-  // CHECK: call <64 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v64i32.v4i32(i1 
false, <4 x i32> %{{.*}} i1 false, <4 x i32> %{{.*}} <64 x i32> %{{.*}} i1 
false, i1 false)
-  %r23.gfx1250 = rocdl.wmma.i32.16x16x64.iu8 %arg5, %arg5, %arg14 {signA = 
false, signB = false} : (vector<4xi32>, vector<4xi32>, vector<64xi32>) -> 
vector<64xi32>
+  // CHECK: call <64 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v64i32.v4i32(i1 
false, <4 x i32> %{{.*}} i1 false, <4 x i32> %{{.*}} <64 x i32> %{{.*}} i1 
false, i1 false, i1 false)
+  %r23.gfx1250 = rocdl.wmma.i32.16x16x64.iu8 %arg5, %arg5, %arg14 {signA = 
false, signB = false, clamp=false} : (vector<4xi32>, vector<4xi32>, 
vector<64xi32>) -> vector<64xi32>
 
   // Test signA=true, signB=true for iu8 gfx1250
-  // CHECK: call <64 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v64i32.v4i32(i1 
true, <4 x i32> %{{.*}} i1 true, <4 x i32> %{{.*}} <64 x i32> %{{.*}} i1 false, 
i1 false)
-  %r23a.gfx1250 = rocdl.wmma.i32.16x16x64.iu8 %arg5, %arg5, %arg14 {signA = 
true, signB = true} : (vector<4xi32>, vector<4xi32>, vector<64xi32>) -> 
vector<64xi32>
+  // CHECK: call <64 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v64i32.v4i32(i1 
true, <4 x i32> %{{.*}} i1 true, <4 x i32> %{{.*}} <64 x i32> %{{.*}} i1 false, 
i1 false, i1 false)
+  %r23a.gfx1250 = rocdl.wmma.i32.16x16x64.iu8 %arg5, %arg5, %arg14 {signA = 
true, signB = true, clamp=false} : (vector<4xi32>, vector<4xi32>, 
vector<64xi32>) -> vector<64xi32>
 
   // Test signA=true, signB=false, reuseA=true, reuseB=true for iu8 gfx1250
-  // CHECK: call <64 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v64i32.v4i32(i1 
true, <4 x i32> %{{.*}} i1 false, <4 x i32> %{{.*}} <64 x i32> %{{.*}} i1 true, 
i1 true)
-  %r23b.gfx1250 = rocdl.wmma.i32.16x16x64.iu8 %arg5, %arg5, %arg14 {signA = 
true, signB = false, reuseA = true, reuseB = true} : (vector<4xi32>, 
vector<4xi32>, vector<64xi32>) -> vector<64xi32>
+  // CHECK: call <64 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v64i32.v4i32(i1 
true, <4 x i32> %{{.*}} i1 false, <4 x i32> %{{.*}} <64 x i32> %{{.*}} i1 true, 
i1 true, i1 false)
+  %r23b.gfx1250 = rocdl.wmma.i32.16x16x64.iu8 %arg5, %arg5, %arg14 {signA = 
true, signB = false, reuseA = true, reuseB = true, clamp=false} : 
(vector<4xi32>, vector<4xi32>, vector<64xi32>) -> vector<64xi32>
 
   // Test signA=true, signB=true with modC=1 for f32 gfx1250
   // CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v4f32.v16f32(i1 
true, <16 x float> %{{.*}} i1 true, <16 x float> %{{.*}} i16 1, <4 x float> 
%{{.*}} i1 false, i1 false)

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to