llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-llvm-analysis @llvm/pr-subscribers-clang-codegen Author: Shilei Tian (shiltian) <details> <summary>Changes</summary> --- Patch is 87.70 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/174310.diff 18 Files Affected: - (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+2-2) - (modified) clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp (+14) - (modified) clang/lib/Sema/SemaAMDGPU.cpp (+35) - (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl (+24-2) - (modified) clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl (+6) - (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+23-4) - (modified) llvm/lib/IR/AutoUpgrade.cpp (+57) - (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+7-5) - (modified) llvm/lib/Target/AMDGPU/VOP3PInstructions.td (+9-9) - (modified) llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll (+6-6) - (added) llvm/test/Bitcode/amdgpu-wmma-clamp-upgrade.ll (+25) - (modified) llvm/test/CodeGen/AMDGPU/wmma-coececution-valu-hazards.mir (+26-26) - (modified) llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx1250-w32.mir (+20-20) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s (+10) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32_err.s (-6) - (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt (+6) - (modified) mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td (+5-4) - (modified) mlir/test/Target/LLVMIR/rocdl.mlir (+6-6) ``````````diff diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 24b79c3b69b67..7faf73b7628fe 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -857,7 +857,7 @@ TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8, "V8hV8iV8iIsV8hIbIb", TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8, "V8hV8iV8iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8, "V8hV8iV8iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8, "V8hV8iV8iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32") -TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x64_iu8, "V8iIbV8iIbV8iV8iIbIb", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x64_iu8, "V8iIbV8iIbV8iV8iIbIb.", "nc", "gfx1250-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8, "V8hV16iV16iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8, "V8hV16iV16iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8, "V8hV16iV16iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32") @@ -885,7 +885,7 @@ TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8, "V8hV8iV16iV8hiIbI TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8, "V8hV8iV16iV8hiIbIb", "nc", "gfx1250-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8, "V8hV8iV16iV8hiIbIb", "nc", "gfx1250-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8, "V8hV8iV16iV8hiIbIb", "nc", "gfx1250-insts,wavefrontsize32") -TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x128_iu8, "V8iIbV8iIbV16iV8iiIbIb", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x128_iu8, "V8iIbV8iIbV16iV8iiIbIb.", "nc", "gfx1250-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x64_f16, "V8fIbV16hIbV32hV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x64_f16, "V8hIbV16hIbV32hV8hiIbIb", "nc", "gfx1250-insts,wavefrontsize32") diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index eabdc370da6b4..a8a5bc348f00c 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -1665,6 +1665,20 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, if (AppendFalseForOpselArg) Args.push_back(Builder.getFalse()); + // Handle the optional clamp argument of the following two builtins. + if (BuiltinID == AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8) { + if (Args.size() == 7) + Args.push_back(Builder.getFalse()); + assert(Args.size() == 8 && "Expected 8 arguments"); + Args[7] = Builder.CreateZExtOrTrunc(Args[7], Builder.getInt1Ty()); + } else if (BuiltinID == + AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x128_iu8) { + if (Args.size() == 8) + Args.push_back(Builder.getFalse()); + assert(Args.size() == 9 && "Expected 9 arguments"); + Args[8] = Builder.CreateZExtOrTrunc(Args[8], Builder.getInt1Ty()); + } + SmallVector<llvm::Type *, 6> ArgTypes; if (NeedReturnType) ArgTypes.push_back(ConvertType(E->getType())); diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp index cece22092bb14..d445d579baa96 100644 --- a/clang/lib/Sema/SemaAMDGPU.cpp +++ b/clang/lib/Sema/SemaAMDGPU.cpp @@ -1,3 +1,4 @@ + //===------ SemaAMDGPU.cpp ------- AMDGPU target-specific routines --------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. @@ -255,6 +256,40 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, (SemaRef.BuiltinConstantArg(TheCall, ArgCount, Result)) || (SemaRef.BuiltinConstantArg(TheCall, (ArgCount - 1), Result)); } + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x128_iu8: { + if (BuiltinID == AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8) { + if (SemaRef.checkArgCountRange(TheCall, 7, 8)) + return true; + if (TheCall->getNumArgs() == 7) + return false; + } else if (BuiltinID == + AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x128_iu8) { + if (SemaRef.checkArgCountRange(TheCall, 8, 9)) + return true; + if (TheCall->getNumArgs() == 8) + return false; + } + // Check if the last argument (clamp operand) is a constant and is + // convertible to bool. + Expr *ClampArg = TheCall->getArg(TheCall->getNumArgs() - 1); + // 1) Ensure clamp argument is a constant expression + llvm::APSInt ClampValue; + if (!SemaRef.VerifyIntegerConstantExpression(ClampArg, &ClampValue) + .isUsable()) + return true; + // 2) Check if the argument can be converted to bool type + if (!SemaRef.Context.hasSameType(ClampArg->getType(), + SemaRef.Context.BoolTy)) { + // Try to convert to bool + QualType BoolTy = SemaRef.Context.BoolTy; + ExprResult ClampExpr(ClampArg); + SemaRef.CheckSingleAssignmentConstraints(BoolTy, ClampExpr); + if (ClampExpr.isInvalid()) + return true; + } + return false; + } default: return false; } diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl index afad4bb15b528..a463ba7ab41c3 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl @@ -148,7 +148,7 @@ void test_amdgcn_wmma_f16_16x16x64_bf8_bf8(global v8h* out, v8i a, v8i b, v8h c) // CHECK-GFX1250-LABEL: @test_amdgcn_wmma_i32_16x16x64_iu8( // CHECK-GFX1250-NEXT: entry: -// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 false, <8 x i32> [[A:%.*]], i1 false, <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false, i1 true) +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 false, <8 x i32> [[A:%.*]], i1 false, <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false, i1 true, i1 false) // CHECK-GFX1250-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA8]] // CHECK-GFX1250-NEXT: ret void // @@ -157,6 +157,17 @@ void test_amdgcn_wmma_i32_16x16x64_iu8(global v8i* out, v8i a, v8i b, v8i c) *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, 0, b, c, false, true); } +// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_i32_16x16x64_iu8_clamp( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 false, <8 x i32> [[A:%.*]], i1 false, <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false, i1 true, i1 true) +// CHECK-GFX1250-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA8]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_wmma_i32_16x16x64_iu8_clamp(global v8i* out, v8i a, v8i b, v8i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, 0, b, c, false, true, 1); +} + // CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x128_f8f6f4( // CHECK-GFX1250-NEXT: entry: // CHECK-GFX1250-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[B:%.*]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> @@ -459,7 +470,7 @@ void test_amdgcn_swmmac_f16_16x16x128_bf8_bf8(global v8h* out, v8i a, v16i b, v8 // CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_i32_16x16x128_iu8( // CHECK-GFX1250-NEXT: entry: -// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i32(i1 false, <8 x i32> [[A:%.*]], i1 false, <16 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true) +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i32(i1 false, <8 x i32> [[A:%.*]], i1 false, <16 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true, i1 false) // CHECK-GFX1250-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA8]] // CHECK-GFX1250-NEXT: ret void // @@ -468,6 +479,17 @@ void test_amdgcn_swmmac_i32_16x16x128_iu8(global v8i* out, v8i a, v16i b, v8i c, *out = __builtin_amdgcn_swmmac_i32_16x16x128_iu8(0, a, 0, b, c, index, false, true); } +// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_i32_16x16x128_iu8_clamp( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i32(i1 false, <8 x i32> [[A:%.*]], i1 false, <16 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true, i1 true) +// CHECK-GFX1250-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA8]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_swmmac_i32_16x16x128_iu8_clamp(global v8i* out, v8i a, v16i b, v8i c, int index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x128_iu8(0, a, 0, b, c, index, false, true, 1); +} + // CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f32_16x16x64_f16( // CHECK-GFX1250-NEXT: entry: // CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i32(i1 false, <16 x half> [[A:%.*]], i1 false, <32 x half> [[B:%.*]], <8 x float> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true) diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl index 49ef2e571740c..e3e0cc3f596c7 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl @@ -112,6 +112,9 @@ void test_amdgcn_wmma_i32_16x16x64_iu8(global v8i* out, v8i a, v8i b, v8i c, int *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, mod, b, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_i32_16x16x64_iu8' must be a constant integer}} *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, 0, b, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_i32_16x16x64_iu8' must be a constant integer}} *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, 0, b, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_i32_16x16x64_iu8' must be a constant integer}} + *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, 0, b, c, false, true, 32.0f); // expected-error {{integer constant expression must have integer type, not 'double'}} + *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, 0, b, c, false, true, mod); // expected-error {{expression is not an integer constant expression}} + *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, 0, b, c, false, true, true, 32.0f); // expected-error {{too many arguments to function call, expected at most 8, have 9}} } void test_amdgcn_wmma_f32_16x16x128_f8f6f4(global v8f* out, v16i a, v16i b, v8f c, int mod) @@ -286,6 +289,9 @@ void test_amdgcn_swmmac_i32_16x16x128_iu8(global v8i* out, v8i a, v16i b, v8i c, *out = __builtin_amdgcn_swmmac_i32_16x16x128_iu8(0, a, mod, b, c, index, false, false); // expected-error {{'__builtin_amdgcn_swmmac_i32_16x16x128_iu8' must be a constant integer}} *out = __builtin_amdgcn_swmmac_i32_16x16x128_iu8(0, a, 0, b, c, index, mod, false); // expected-error {{'__builtin_amdgcn_swmmac_i32_16x16x128_iu8' must be a constant integer}} *out = __builtin_amdgcn_swmmac_i32_16x16x128_iu8(0, a, 0, b, c, index, false, mod); // expected-error {{'__builtin_amdgcn_swmmac_i32_16x16x128_iu8' must be a constant integer}} + *out = __builtin_amdgcn_swmmac_i32_16x16x128_iu8(0, a, 0, b, c, index, false, true, 32.0f); // expected-error {{integer constant expression must have integer type, not 'double'}} + *out = __builtin_amdgcn_swmmac_i32_16x16x128_iu8(0, a, 0, b, c, index, false, true, mod); // expected-error {{expression is not an integer constant expression}} + *out = __builtin_amdgcn_swmmac_i32_16x16x128_iu8(0, a, 0, b, c, index, false, true, true, 32.0f); // expected-error {{too many arguments to function call, expected at most 9, have 10}} } void test_amdgcn_swmmac_f32_16x16x64_f16(global v8f* out, v16h a, v32h b, v8f c, int index, int mod) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index f7d4b51c75168..2fb25bac43756 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -3902,7 +3902,7 @@ def int_amdgcn_global_store_async_from_lds_b128 : ClangBuiltin<"__builtin_amdgcn_global_store_async_from_lds_b128">, AMDGPUAsyncGlobalStoreFromLDS; // WMMA intrinsics. -class AMDGPUWmmaIntrinsicModsAB<LLVMType AB, LLVMType CD> : +class AMDGPUWmmaIntrinsicModsABClamp<LLVMType AB, LLVMType CD> : Intrinsic< [CD], // %D [ @@ -3913,8 +3913,9 @@ class AMDGPUWmmaIntrinsicModsAB<LLVMType AB, LLVMType CD> : LLVMMatchType<0>, // %C llvm_i1_ty, // matrix_a_reuse llvm_i1_ty, // matrix_b_reuse + llvm_i1_ty, // %clamp ], - [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>, + [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>, IntrWillReturn, IntrNoCallback, IntrNoFree, IntrNoCreateUndefOrPoison] >; @@ -4079,7 +4080,7 @@ def int_amdgcn_wmma_f32_16x16x128_fp8_fp8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint def int_amdgcn_wmma_f32_16x16x128_fp8_bf8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>; def int_amdgcn_wmma_f32_16x16x128_bf8_fp8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>; def int_amdgcn_wmma_f32_16x16x128_bf8_bf8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>; -def int_amdgcn_wmma_i32_16x16x64_iu8 : AMDGPUWmmaIntrinsicModsAB<llvm_anyint_ty, llvm_anyint_ty>; +def int_amdgcn_wmma_i32_16x16x64_iu8 : AMDGPUWmmaIntrinsicModsABClamp<llvm_anyint_ty, llvm_anyint_ty>; def int_amdgcn_wmma_f32_16x16x128_f8f6f4 : AMDGPUWmmaIntrinsicModsC_MatrixFMT; def int_amdgcn_wmma_scale_f32_16x16x128_f8f6f4 : AMDGPUWmmaScaleIntrinsicModsC<llvm_i32_ty>; def int_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4 : AMDGPUWmmaScaleIntrinsicModsC<llvm_i64_ty>; @@ -4105,6 +4106,24 @@ class AMDGPUSWmmacIntrinsicABIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType I ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>] >; +class AMDGPUSWmmacIntrinsicABIdxClamp<LLVMType A, LLVMType B, LLVMType CD, LLVMType Index> : + Intrinsic< + [CD], // %D + [ + llvm_i1_ty, // %A_mod: 0 - none, 1 - neg + A, // %A + llvm_i1_ty, // %B_mod: 0 - none, 1 - neg + B, // %B + LLVMMatchType<0>, // %C + Index, // %Sparsity index for A + llvm_i1_ty, // matrix_a_reuse + llvm_i1_ty, // matrix_b_reuse + llvm_i1_ty, // %clamp + ], + [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCreateUndefOrPoison, + ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>, ImmArg<ArgIndex<8>>] +>; + defset list<Intrinsic> AMDGPUSWMMACIntrinsicsGFX1250 = { def int_amdgcn_swmmac_f32_16x16x64_f16 : AMDGPUSWmmacIntrinsicABIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>; def int_amdgcn_swmmac_f32_16x16x64_bf16 : AMDGPUSWmmacIntrinsicABIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>; @@ -4119,7 +4138,7 @@ def int_amdgcn_swmmac_f16_16x16x128_fp8_fp8 : AMDGPUSWmmacIntrinsicIdxReuse<llvm def int_amdgcn_swmmac_f16_16x16x128_fp8_bf8 : AMDGPUSWmmacIntrinsicIdxReuse<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>; def int_amdgcn_swmmac_f16_16x16x128_bf8_fp8 : AMDGPUSWmmacIntrinsicIdxReuse<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>; def int_amdgcn_swmmac_f16_16x16x128_bf8_bf8 : AMDGPUSWmmacIntrinsicIdxReuse<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>; -def int_amdgcn_swmmac_i32_16x16x128_iu8 : AMDGPUSWmmacIntrinsicABIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>; +def int_amdgcn_swmmac_i32_16x16x128_iu8 : AMDGPUSWmmacIntrinsicABIdxClamp<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>; } diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index cbb7b6ee4f3f5..d8b54c396df28 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -32,6 +32,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsAArch64.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsARM.h" #include "llvm/IR/IntrinsicsNVPTX.h" #include "llvm/IR/IntrinsicsRISCV.h" @@ -1284,6 +1285,18 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, break; // No other 'amdgcn.atomic.*' } + // Legacy wmma iu intrinsics without the optional clamp operand. + if (F->getIntrinsicID() == Intrinsic::amdgcn_wmma_i32_16x16x64_iu8 && + F->arg_size() == 7) { + NewFn = nullptr; + return true; + } + if (F->getIntrinsicID() == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8 && + F->arg_size() == 8) { + NewFn = nullptr; + return true; + } + if (Name.consume_front("ds.") || Name.consume_front("global.atomic.") || Name.consume_front("flat.atomic.")) { if (Name.starts_with("fadd") || @@ -4620,6 +4633,50 @@ static Value *upgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F, // static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, CallBase *CI, Function *F, IRBuilder<> &Builder) { + // Legacy WMMA iu intrinsics missed the optional clamp operand. Append clamp=0 + // for compatibility. + auto UpgradeLegacyWMMAIUIntrinsicCall = + [](Function *F, CallBase *CI, IRBuilder<> &Builder, + ArrayRef<Type *> OverloadTys) -> Value * { + // Prepare arguments, append clamp=0 for compatibility + SmallVector<Value *, 10> Args(CI->args().begin(), CI->args().end()); + Args.push_back(Builder.getFalse()); + + // Insert the declaration for the right overload types + Function *NewDecl = Intrinsic::getOrInsertDeclaration( + F->getParent(), F->getIntrinsicID(), OverloadTys); + + // Copy operand bundles if any + SmallVector<OperandBundleDef, 1> Bundles; + CI->getOperandBundlesAsDefs(Bundles); + + // Create the new call and copy calling properties + auto *NewCall = cast<CallInst>(Builder.CreateCall(NewDecl, Args, Bundles)); + NewCall->setTailCallKind(cast<CallInst>(CI)->getTailCallKind()); + NewCall->setCallingConv(CI->getCallingConv()); + NewCall->setAttributes(CI->getAttributes()); + NewCall->setDebugLoc(CI->getDebugLoc()); + NewCall->copyMetadata(*CI); + return NewCall; + }; + + if (F->getIntrinsicID() == Intrinsic::amdgcn_wmma_i32_16x16x64_iu8) { + assert(CI->arg_size() == 7 && "Legacy int_amdgcn_wmma_i32_16x16x64_iu8 " + "intrinsic should have 7 arguments"); + Type *T1 = CI->getArgOperand(4)->getType(); + Type *T2 = CI->getArgOperand(1)->getType(); + return UpgradeLegacyWMMAIUIntrinsicCall(F, CI, Builder, {T1, T2}); + } + if (F->getIntrinsicID() == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8) { + assert(CI->arg_size() == 8 && "Legacy int_amdgcn_swmmac_i32_16x16x128_iu8 " + "intrinsic should have 8 arguments"); + Type *T1 = CI->getArgOperand(4)->getType(); + Type *T2 = CI->getArgOperand(1)->getType(); + Type *T3 = CI->getArgOperand(3)->getType(); + Type *T4 = CI->getArgOperand(5)->getType(); + return UpgradeLegacyWMMAIUIntrinsicCall(F, CI, Builder, {T1, T2, T3, T4}); + } + AtomicRMWInst::BinOp RMWOp = StringSwitch<AtomicRMWInst::BinOp>(... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/174310 _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
