[PATCH] D97358: [X86] Support amx-bf16 intrinsic.
xiangzhangllvm added a comment. +1 first, didn't see key problems. Comment at: clang/lib/Headers/amxintrin.h:326 +__DEFAULT_FN_ATTRS_BF16 +static void __tile_tdpbf16ps(__tile1024i *dst, __tile1024i src1, + __tile1024i src2) { yubing wrote: > Should we align this with "tile_dpbssd" by renaming it with "tile_dpbf16ps"? Yes, "t" already means "tile". Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D97358/new/ https://reviews.llvm.org/D97358 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D97358: [X86] Support amx-bf16 intrinsic.
yubing added inline comments. Comment at: clang/lib/Headers/amxintrin.h:326 +__DEFAULT_FN_ATTRS_BF16 +static void __tile_tdpbf16ps(__tile1024i *dst, __tile1024i src1, + __tile1024i src2) { Should we align this with "tile_dpbssd" by renaming it with "tile_dpbf16ps"? Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D97358/new/ https://reviews.llvm.org/D97358 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D97358: [X86] Support amx-bf16 intrinsic.
This revision was landed with ongoing or failed builds. This revision was automatically updated to reflect the committed changes. Closed by commit rG4bc7c8631ad6: [X86] Support amx-bf16 intrinsic. (authored by LiuChen3). Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D97358/new/ https://reviews.llvm.org/D97358 Files: clang/include/clang/Basic/BuiltinsX86_64.def clang/lib/Headers/amxintrin.h clang/test/CodeGen/X86/amx_api.c llvm/include/llvm/IR/IntrinsicsX86.td llvm/lib/Target/X86/X86ExpandPseudo.cpp llvm/lib/Target/X86/X86ISelDAGToDAG.cpp llvm/lib/Target/X86/X86InstrAMX.td llvm/lib/Target/X86/X86LowerAMXType.cpp llvm/lib/Target/X86/X86PreTileConfig.cpp llvm/lib/Target/X86/X86RegisterInfo.cpp llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll Index: llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll === --- llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll +++ llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile -mattr=+avx512f -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-int8,+amx-bf16,+avx512f -verify-machineinstrs | FileCheck %s define void @test_amx(i8* %pointer, i8* %base, i64 %stride) { ; CHECK-LABEL: test_amx: @@ -22,6 +22,7 @@ ; CHECK-NEXT:tdpbsud %tmm2, %tmm1, %tmm0 ; CHECK-NEXT:tdpbusd %tmm2, %tmm1, %tmm0 ; CHECK-NEXT:tdpbuud %tmm2, %tmm1, %tmm0 +; CHECK-NEXT:tdpbf16ps %tmm2, %tmm1, %tmm0 ; CHECK-NEXT:tilestored %tmm0, (%rdi,%rdx) ; CHECK-NEXT:tilerelease ; CHECK-NEXT:vzeroupper @@ -33,7 +34,8 @@ %d1 = call x86_amx @llvm.x86.tdpbsud.internal(i16 8, i16 8, i16 8, x86_amx %d0, x86_amx %a, x86_amx %b) %d2 = call x86_amx @llvm.x86.tdpbusd.internal(i16 8, i16 8, i16 8, x86_amx %d1, x86_amx %a, x86_amx %b) %d3 = call x86_amx @llvm.x86.tdpbuud.internal(i16 8, i16 8, i16 8, x86_amx %d2, x86_amx %a, x86_amx %b) - call void 
@llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %d3) + %d4 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 8, i16 8, i16 8, x86_amx %d3, x86_amx %a, x86_amx %b) + call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %d4) ret void } @@ -44,4 +46,5 @@ declare x86_amx @llvm.x86.tdpbsud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) declare x86_amx @llvm.x86.tdpbuud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) +declare x86_amx @llvm.x86.tdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) Index: llvm/lib/Target/X86/X86RegisterInfo.cpp === --- llvm/lib/Target/X86/X86RegisterInfo.cpp +++ llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -888,6 +888,7 @@ case X86::PTDPBUSDV: case X86::PTDPBUUDV: case X86::PTILEZEROV: + case X86::PTDPBF16PSV: MachineOperand = MI->getOperand(1); MachineOperand = MI->getOperand(2); ShapeT Shape(, , MRI); Index: llvm/lib/Target/X86/X86PreTileConfig.cpp === --- llvm/lib/Target/X86/X86PreTileConfig.cpp +++ llvm/lib/Target/X86/X86PreTileConfig.cpp @@ -159,6 +159,7 @@ case X86::PTDPBUSDV: case X86::PTDPBUUDV: case X86::PTILEZEROV: + case X86::PTDPBF16PSV: MachineOperand = const_cast(MI.getOperand(1)); MachineOperand = const_cast(MI.getOperand(2)); ShapeT Shape(, , MRI); @@ -256,6 +257,7 @@ case X86::PTDPBUSDV: case X86::PTDPBUUDV: case X86::PTILEZEROV: + case X86::PTDPBF16PSV: return true; } } Index: llvm/lib/Target/X86/X86LowerAMXType.cpp === --- llvm/lib/Target/X86/X86LowerAMXType.cpp +++ llvm/lib/Target/X86/X86LowerAMXType.cpp @@ -70,7 +70,8 @@ case Intrinsic::x86_tdpbssd_internal: case Intrinsic::x86_tdpbsud_internal: case Intrinsic::x86_tdpbusd_internal: - case Intrinsic::x86_tdpbuud_internal: { + case Intrinsic::x86_tdpbuud_internal: + case Intrinsic::x86_tdpbf16ps_internal: { switch (OpNo) { case 3: Row 
= II->getArgOperand(0); Index: llvm/lib/Target/X86/X86InstrAMX.td === --- llvm/lib/Target/X86/X86InstrAMX.td +++ llvm/lib/Target/X86/X86InstrAMX.td @@ -138,6 +138,16 @@ "tdpbf16ps\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>, VEX_4V, T8XS; +// Pseduo instruction for RA. +let Constraints = "$src4 = $dst" in + def PTDPBF16PSV : PseudoI<(outs TILE: $dst), (ins GR16:$src1, + GR16:$src2, GR16:$src3,
[PATCH] D97358: [X86] Support amx-bf16 intrinsic.
LiuChen3 added a comment. I don't know why the pre-merge checks failed. I can run check-all successfully locally on Red Hat 8. I don't have a Debian machine to reproduce this problem. Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D97358/new/ https://reviews.llvm.org/D97358 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D97358: [X86] Support amx-bf16 intrinsic.
pengfei accepted this revision. pengfei added a comment. This revision is now accepted and ready to land. LGTM. Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D97358/new/ https://reviews.llvm.org/D97358 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D97358: [X86] Support amx-bf16 intrinsic.
LiuChen3 updated this revision to Diff 326002. LiuChen3 added a comment. Address Pengfei and Yuanke's comments. We don't need more tile type. Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D97358/new/ https://reviews.llvm.org/D97358 Files: clang/include/clang/Basic/BuiltinsX86_64.def clang/lib/Headers/amxintrin.h clang/test/CodeGen/X86/amx_api.c llvm/include/llvm/IR/IntrinsicsX86.td llvm/lib/Target/X86/X86ExpandPseudo.cpp llvm/lib/Target/X86/X86ISelDAGToDAG.cpp llvm/lib/Target/X86/X86InstrAMX.td llvm/lib/Target/X86/X86LowerAMXType.cpp llvm/lib/Target/X86/X86PreTileConfig.cpp llvm/lib/Target/X86/X86RegisterInfo.cpp llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll Index: llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll === --- llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll +++ llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile -mattr=+avx512f -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-int8,+amx-bf16,+avx512f -verify-machineinstrs | FileCheck %s define void @test_amx(i8* %pointer, i8* %base, i64 %stride) { ; CHECK-LABEL: test_amx: @@ -22,6 +22,7 @@ ; CHECK-NEXT:tdpbsud %tmm2, %tmm1, %tmm0 ; CHECK-NEXT:tdpbusd %tmm2, %tmm1, %tmm0 ; CHECK-NEXT:tdpbuud %tmm2, %tmm1, %tmm0 +; CHECK-NEXT:tdpbf16ps %tmm2, %tmm1, %tmm0 ; CHECK-NEXT:tilestored %tmm0, (%rdi,%rdx) ; CHECK-NEXT:tilerelease ; CHECK-NEXT:vzeroupper @@ -33,7 +34,8 @@ %d1 = call x86_amx @llvm.x86.tdpbsud.internal(i16 8, i16 8, i16 8, x86_amx %d0, x86_amx %a, x86_amx %b) %d2 = call x86_amx @llvm.x86.tdpbusd.internal(i16 8, i16 8, i16 8, x86_amx %d1, x86_amx %a, x86_amx %b) %d3 = call x86_amx @llvm.x86.tdpbuud.internal(i16 8, i16 8, i16 8, x86_amx %d2, x86_amx %a, x86_amx %b) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %d3) + %d4 
= call x86_amx @llvm.x86.tdpbf16ps.internal(i16 8, i16 8, i16 8, x86_amx %d3, x86_amx %a, x86_amx %b) + call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %d4) ret void } @@ -44,4 +46,5 @@ declare x86_amx @llvm.x86.tdpbsud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) declare x86_amx @llvm.x86.tdpbuud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) +declare x86_amx @llvm.x86.tdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) Index: llvm/lib/Target/X86/X86RegisterInfo.cpp === --- llvm/lib/Target/X86/X86RegisterInfo.cpp +++ llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -888,6 +888,7 @@ case X86::PTDPBUSDV: case X86::PTDPBUUDV: case X86::PTILEZEROV: + case X86::PTDPBF16PSV: MachineOperand = MI->getOperand(1); MachineOperand = MI->getOperand(2); ShapeT Shape(, , MRI); Index: llvm/lib/Target/X86/X86PreTileConfig.cpp === --- llvm/lib/Target/X86/X86PreTileConfig.cpp +++ llvm/lib/Target/X86/X86PreTileConfig.cpp @@ -159,6 +159,7 @@ case X86::PTDPBUSDV: case X86::PTDPBUUDV: case X86::PTILEZEROV: + case X86::PTDPBF16PSV: MachineOperand = const_cast(MI.getOperand(1)); MachineOperand = const_cast(MI.getOperand(2)); ShapeT Shape(, , MRI); @@ -256,6 +257,7 @@ case X86::PTDPBUSDV: case X86::PTDPBUUDV: case X86::PTILEZEROV: + case X86::PTDPBF16PSV: return true; } } Index: llvm/lib/Target/X86/X86LowerAMXType.cpp === --- llvm/lib/Target/X86/X86LowerAMXType.cpp +++ llvm/lib/Target/X86/X86LowerAMXType.cpp @@ -70,7 +70,8 @@ case Intrinsic::x86_tdpbssd_internal: case Intrinsic::x86_tdpbsud_internal: case Intrinsic::x86_tdpbusd_internal: - case Intrinsic::x86_tdpbuud_internal: { + case Intrinsic::x86_tdpbuud_internal: + case Intrinsic::x86_tdpbf16ps_internal: { switch (OpNo) { case 3: Row = II->getArgOperand(0); Index: llvm/lib/Target/X86/X86InstrAMX.td === --- 
llvm/lib/Target/X86/X86InstrAMX.td +++ llvm/lib/Target/X86/X86InstrAMX.td @@ -138,6 +138,16 @@ "tdpbf16ps\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>, VEX_4V, T8XS; +// Pseduo instruction for RA. +let Constraints = "$src4 = $dst" in + def PTDPBF16PSV : PseudoI<(outs TILE: $dst), (ins GR16:$src1, + GR16:$src2, GR16:$src3, TILE:$src4, + TILE:$src5, TILE:$src6), +
[PATCH] D97358: [X86] Support amx-bf16 intrinsic.
pengfei added inline comments. Comment at: clang/lib/Headers/amxintrin.h:283 +typedef struct __tile1024bf16_str { + const unsigned short row; LiuChen3 wrote: > LuoYuanke wrote: > > pengfei wrote: > > > Is there much value to differentiate the type? We are using the same AMX > > > type in the builtins. What do you think? @LuoYuanke > > My first thought is that we can reuse __tile1024i for bf16 tile for 2 > > reasons. > > 1. We don't access the element of the tile. > > 2. The destination element of amx-int8 is int32 and the destination element > > of amx-bf16 is float32, the element size is the same. > Does this mean that the user needs to do an explicit type conversion? We don't allow users to use the tile in the structure. Users should always use the load/store intrinsics to pass their own data. Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D97358/new/ https://reviews.llvm.org/D97358 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D97358: [X86] Support amx-bf16 intrinsic.
LiuChen3 added inline comments. Comment at: clang/lib/Headers/amxintrin.h:283 +typedef struct __tile1024bf16_str { + const unsigned short row; LuoYuanke wrote: > pengfei wrote: > > Is there much value to differentiate the type? We are using the same AMX > > type in the builtins. What do you think? @LuoYuanke > My first thought is that we can reuse __tile1024i for bf16 tile for 2 reasons. > 1. We don't access the element of the tile. > 2. The destination element of amx-int8 is int32 and the destination element > of amx-bf16 is float32, the element size is the same. Does this mean that the user needs to do an explicit type conversion? Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D97358/new/ https://reviews.llvm.org/D97358 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D97358: [X86] Support amx-bf16 intrinsic.
LuoYuanke added inline comments. Comment at: clang/lib/Headers/amxintrin.h:283 +typedef struct __tile1024bf16_str { + const unsigned short row; pengfei wrote: > Is there much value to differentiate the type? We are using the same AMX type > in the builtins. What do you think? @LuoYuanke My first thought is that we can reuse __tile1024i for the bf16 tile, for 2 reasons. 1. We don't access the element of the tile. 2. The destination element of amx-int8 is int32 and the destination element of amx-bf16 is float32, so the element size is the same. Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D97358/new/ https://reviews.llvm.org/D97358 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D97358: [X86] Support amx-bf16 intrinsic.
pengfei added inline comments. Comment at: clang/lib/Headers/amxintrin.h:283 +typedef struct __tile1024bf16_str { + const unsigned short row; Is there much value to differentiate the type? We are using the same AMX type in the builtins. What do you think? @LuoYuanke Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D97358/new/ https://reviews.llvm.org/D97358 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D97358: [X86] Support amx-bf16 intrinsic.
LiuChen3 updated this revision to Diff 325988. LiuChen3 added a comment. Adding back 'avx512f' to amx-tile-basic.ll Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D97358/new/ https://reviews.llvm.org/D97358 Files: clang/include/clang/Basic/BuiltinsX86_64.def clang/lib/Headers/amxintrin.h clang/test/CodeGen/X86/amx_api.c llvm/include/llvm/IR/IntrinsicsX86.td llvm/lib/Target/X86/X86ExpandPseudo.cpp llvm/lib/Target/X86/X86ISelDAGToDAG.cpp llvm/lib/Target/X86/X86InstrAMX.td llvm/lib/Target/X86/X86LowerAMXType.cpp llvm/lib/Target/X86/X86PreTileConfig.cpp llvm/lib/Target/X86/X86RegisterInfo.cpp llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll Index: llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll === --- llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll +++ llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile -mattr=+avx512f -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-int8,+amx-bf16,+avx512f -verify-machineinstrs | FileCheck %s define void @test_amx(i8* %pointer, i8* %base, i64 %stride) { ; CHECK-LABEL: test_amx: @@ -22,6 +22,7 @@ ; CHECK-NEXT:tdpbsud %tmm2, %tmm1, %tmm0 ; CHECK-NEXT:tdpbusd %tmm2, %tmm1, %tmm0 ; CHECK-NEXT:tdpbuud %tmm2, %tmm1, %tmm0 +; CHECK-NEXT:tdpbf16ps %tmm2, %tmm1, %tmm0 ; CHECK-NEXT:tilestored %tmm0, (%rdi,%rdx) ; CHECK-NEXT:tilerelease ; CHECK-NEXT:vzeroupper @@ -33,7 +34,8 @@ %d1 = call x86_amx @llvm.x86.tdpbsud.internal(i16 8, i16 8, i16 8, x86_amx %d0, x86_amx %a, x86_amx %b) %d2 = call x86_amx @llvm.x86.tdpbusd.internal(i16 8, i16 8, i16 8, x86_amx %d1, x86_amx %a, x86_amx %b) %d3 = call x86_amx @llvm.x86.tdpbuud.internal(i16 8, i16 8, i16 8, x86_amx %d2, x86_amx %a, x86_amx %b) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %d3) + %d4 = call x86_amx 
@llvm.x86.tdpbf16ps.internal(i16 8, i16 8, i16 8, x86_amx %d3, x86_amx %a, x86_amx %b) + call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %d4) ret void } @@ -44,4 +46,5 @@ declare x86_amx @llvm.x86.tdpbsud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) declare x86_amx @llvm.x86.tdpbuud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) +declare x86_amx @llvm.x86.tdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) Index: llvm/lib/Target/X86/X86RegisterInfo.cpp === --- llvm/lib/Target/X86/X86RegisterInfo.cpp +++ llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -888,6 +888,7 @@ case X86::PTDPBUSDV: case X86::PTDPBUUDV: case X86::PTILEZEROV: + case X86::PTDPBF16PSV: MachineOperand = MI->getOperand(1); MachineOperand = MI->getOperand(2); ShapeT Shape(, , MRI); Index: llvm/lib/Target/X86/X86PreTileConfig.cpp === --- llvm/lib/Target/X86/X86PreTileConfig.cpp +++ llvm/lib/Target/X86/X86PreTileConfig.cpp @@ -159,6 +159,7 @@ case X86::PTDPBUSDV: case X86::PTDPBUUDV: case X86::PTILEZEROV: + case X86::PTDPBF16PSV: MachineOperand = const_cast(MI.getOperand(1)); MachineOperand = const_cast(MI.getOperand(2)); ShapeT Shape(, , MRI); @@ -256,6 +257,7 @@ case X86::PTDPBUSDV: case X86::PTDPBUUDV: case X86::PTILEZEROV: + case X86::PTDPBF16PSV: return true; } } Index: llvm/lib/Target/X86/X86LowerAMXType.cpp === --- llvm/lib/Target/X86/X86LowerAMXType.cpp +++ llvm/lib/Target/X86/X86LowerAMXType.cpp @@ -70,7 +70,8 @@ case Intrinsic::x86_tdpbssd_internal: case Intrinsic::x86_tdpbsud_internal: case Intrinsic::x86_tdpbusd_internal: - case Intrinsic::x86_tdpbuud_internal: { + case Intrinsic::x86_tdpbuud_internal: + case Intrinsic::x86_tdpbf16ps_internal: { switch (OpNo) { case 3: Row = II->getArgOperand(0); Index: llvm/lib/Target/X86/X86InstrAMX.td === --- 
llvm/lib/Target/X86/X86InstrAMX.td +++ llvm/lib/Target/X86/X86InstrAMX.td @@ -138,6 +138,16 @@ "tdpbf16ps\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>, VEX_4V, T8XS; +// Pseduo instruction for RA. +let Constraints = "$src4 = $dst" in + def PTDPBF16PSV : PseudoI<(outs TILE: $dst), (ins GR16:$src1, + GR16:$src2, GR16:$src3, TILE:$src4, + TILE:$src5, TILE:$src6), +
[PATCH] D97358: [X86] Support amx-bf16 intrinsic.
LiuChen3 created this revision. Herald added subscribers: pengfei, hiraditya. LiuChen3 requested review of this revision. Herald added projects: clang, LLVM. Herald added subscribers: llvm-commits, cfe-commits. Adding support for intrinsics of AMX-BF16. This patch also fixes a bug where AMX-INT8 instructions would be selected with the wrong predicate. Repository: rG LLVM Github Monorepo https://reviews.llvm.org/D97358 Files: clang/include/clang/Basic/BuiltinsX86_64.def clang/lib/Headers/amxintrin.h clang/test/CodeGen/X86/amx_api.c llvm/include/llvm/IR/IntrinsicsX86.td llvm/lib/Target/X86/X86ExpandPseudo.cpp llvm/lib/Target/X86/X86ISelDAGToDAG.cpp llvm/lib/Target/X86/X86InstrAMX.td llvm/lib/Target/X86/X86LowerAMXType.cpp llvm/lib/Target/X86/X86PreTileConfig.cpp llvm/lib/Target/X86/X86RegisterInfo.cpp llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll Index: llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll === --- llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll +++ llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll @@ -1,11 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile -mattr=+avx512f -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-int8,+amx-bf16 -verify-machineinstrs | FileCheck %s define void @test_amx(i8* %pointer, i8* %base, i64 %stride) { ; CHECK-LABEL: test_amx: ; CHECK: # %bb.0: -; CHECK-NEXT:vpxord %zmm0, %zmm0, %zmm0 -; CHECK-NEXT:vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT:xorps %xmm0, %xmm0 +; CHECK-NEXT:movups %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT:movups %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT:movups %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT:movups %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT:movb $1, -{{[0-9]+}}(%rsp) ; CHECK-NEXT:movb $8, -{{[0-9]+}}(%rsp) ; CHECK-NEXT:movw $8, -{{[0-9]+}}(%rsp) @@ -22,9 +25,9 @@ ; CHECK-NEXT:tdpbsud %tmm2, %tmm1, %tmm0 ; CHECK-NEXT:tdpbusd %tmm2, %tmm1, %tmm0 ; CHECK-NEXT:tdpbuud %tmm2, 
%tmm1, %tmm0 +; CHECK-NEXT:tdpbf16ps %tmm2, %tmm1, %tmm0 ; CHECK-NEXT:tilestored %tmm0, (%rdi,%rdx) ; CHECK-NEXT:tilerelease -; CHECK-NEXT:vzeroupper ; CHECK-NEXT:retq %c = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8) %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride) @@ -33,7 +36,8 @@ %d1 = call x86_amx @llvm.x86.tdpbsud.internal(i16 8, i16 8, i16 8, x86_amx %d0, x86_amx %a, x86_amx %b) %d2 = call x86_amx @llvm.x86.tdpbusd.internal(i16 8, i16 8, i16 8, x86_amx %d1, x86_amx %a, x86_amx %b) %d3 = call x86_amx @llvm.x86.tdpbuud.internal(i16 8, i16 8, i16 8, x86_amx %d2, x86_amx %a, x86_amx %b) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %d3) + %d4 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 8, i16 8, i16 8, x86_amx %d3, x86_amx %a, x86_amx %b) + call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %d4) ret void } @@ -44,4 +48,5 @@ declare x86_amx @llvm.x86.tdpbsud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) declare x86_amx @llvm.x86.tdpbuud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) +declare x86_amx @llvm.x86.tdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) Index: llvm/lib/Target/X86/X86RegisterInfo.cpp === --- llvm/lib/Target/X86/X86RegisterInfo.cpp +++ llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -888,6 +888,7 @@ case X86::PTDPBUSDV: case X86::PTDPBUUDV: case X86::PTILEZEROV: + case X86::PTDPBF16PSV: MachineOperand = MI->getOperand(1); MachineOperand = MI->getOperand(2); ShapeT Shape(, , MRI); Index: llvm/lib/Target/X86/X86PreTileConfig.cpp === --- llvm/lib/Target/X86/X86PreTileConfig.cpp +++ llvm/lib/Target/X86/X86PreTileConfig.cpp @@ -159,6 +159,7 @@ case X86::PTDPBUSDV: case X86::PTDPBUUDV: case X86::PTILEZEROV: + case X86::PTDPBF16PSV: 
MachineOperand = const_cast(MI.getOperand(1)); MachineOperand = const_cast(MI.getOperand(2)); ShapeT Shape(, , MRI); @@ -256,6 +257,7 @@ case X86::PTDPBUSDV: case X86::PTDPBUUDV: case X86::PTILEZEROV: + case X86::PTDPBF16PSV: return true; } } Index: llvm/lib/Target/X86/X86LowerAMXType.cpp === --- llvm/lib/Target/X86/X86LowerAMXType.cpp +++ llvm/lib/Target/X86/X86LowerAMXType.cpp @@ -70,7 +70,8 @@ case Intrinsic::x86_tdpbssd_internal: case Intrinsic::x86_tdpbsud_internal: case Intrinsic::x86_tdpbusd_internal: - case