[PATCH] D97358: [X86] Support amx-bf16 intrinsic.

2021-03-16 Thread Xiang Zhang via Phabricator via cfe-commits
xiangzhangllvm added a comment.

+1 first, didn't see key problems.




Comment at: clang/lib/Headers/amxintrin.h:326
+__DEFAULT_FN_ATTRS_BF16
+static void __tile_tdpbf16ps(__tile1024i *dst, __tile1024i src1,
+ __tile1024i src2) {

yubing wrote:
> Should we align this with "tile_dpbssd" by renaming it wth "tile_dpbf16ps"?
Yes, "t" already means "tile"


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D97358/new/

https://reviews.llvm.org/D97358

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D97358: [X86] Support amx-bf16 intrinsic.

2021-03-16 Thread Bing Yu via Phabricator via cfe-commits
yubing added inline comments.



Comment at: clang/lib/Headers/amxintrin.h:326
+__DEFAULT_FN_ATTRS_BF16
+static void __tile_tdpbf16ps(__tile1024i *dst, __tile1024i src1,
+ __tile1024i src2) {

Should we align this with "tile_dpbssd" by renaming it wth "tile_dpbf16ps"?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D97358/new/

https://reviews.llvm.org/D97358

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D97358: [X86] Support amx-bf16 intrinsic.

2021-02-24 Thread LiuChen via Phabricator via cfe-commits
This revision was landed with ongoing or failed builds.
This revision was automatically updated to reflect the committed changes.
Closed by commit rG4bc7c8631ad6: [X86] Support amx-bf16 intrinsic. (authored by 
LiuChen3).

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D97358/new/

https://reviews.llvm.org/D97358

Files:
  clang/include/clang/Basic/BuiltinsX86_64.def
  clang/lib/Headers/amxintrin.h
  clang/test/CodeGen/X86/amx_api.c
  llvm/include/llvm/IR/IntrinsicsX86.td
  llvm/lib/Target/X86/X86ExpandPseudo.cpp
  llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
  llvm/lib/Target/X86/X86InstrAMX.td
  llvm/lib/Target/X86/X86LowerAMXType.cpp
  llvm/lib/Target/X86/X86PreTileConfig.cpp
  llvm/lib/Target/X86/X86RegisterInfo.cpp
  llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll

Index: llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
===
--- llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
+++ llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile -mattr=+avx512f -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-int8,+amx-bf16,+avx512f -verify-machineinstrs | FileCheck %s
 
 define void @test_amx(i8* %pointer, i8* %base, i64 %stride) {
 ; CHECK-LABEL: test_amx:
@@ -22,6 +22,7 @@
 ; CHECK-NEXT:tdpbsud %tmm2, %tmm1, %tmm0
 ; CHECK-NEXT:tdpbusd %tmm2, %tmm1, %tmm0
 ; CHECK-NEXT:tdpbuud %tmm2, %tmm1, %tmm0
+; CHECK-NEXT:tdpbf16ps %tmm2, %tmm1, %tmm0
 ; CHECK-NEXT:tilestored %tmm0, (%rdi,%rdx)
 ; CHECK-NEXT:tilerelease
 ; CHECK-NEXT:vzeroupper
@@ -33,7 +34,8 @@
   %d1 = call x86_amx @llvm.x86.tdpbsud.internal(i16 8, i16 8, i16 8, x86_amx %d0, x86_amx %a, x86_amx %b)
   %d2 = call x86_amx @llvm.x86.tdpbusd.internal(i16 8, i16 8, i16 8, x86_amx %d1, x86_amx %a, x86_amx %b)
   %d3 = call x86_amx @llvm.x86.tdpbuud.internal(i16 8, i16 8, i16 8, x86_amx %d2, x86_amx %a, x86_amx %b)
-  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %d3)
+  %d4 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 8, i16 8, i16 8, x86_amx %d3, x86_amx %a, x86_amx %b)
+  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %d4)
 
   ret void
 }
@@ -44,4 +46,5 @@
 declare x86_amx @llvm.x86.tdpbsud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
 declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
 declare x86_amx @llvm.x86.tdpbuud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
+declare x86_amx @llvm.x86.tdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
 declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
Index: llvm/lib/Target/X86/X86RegisterInfo.cpp
===
--- llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -888,6 +888,7 @@
   case X86::PTDPBUSDV:
   case X86::PTDPBUUDV:
   case X86::PTILEZEROV:
+  case X86::PTDPBF16PSV:
 MachineOperand  = MI->getOperand(1);
 MachineOperand  = MI->getOperand(2);
 ShapeT Shape(, , MRI);
Index: llvm/lib/Target/X86/X86PreTileConfig.cpp
===
--- llvm/lib/Target/X86/X86PreTileConfig.cpp
+++ llvm/lib/Target/X86/X86PreTileConfig.cpp
@@ -159,6 +159,7 @@
   case X86::PTDPBUSDV:
   case X86::PTDPBUUDV:
   case X86::PTILEZEROV:
+  case X86::PTDPBF16PSV:
 MachineOperand  = const_cast(MI.getOperand(1));
 MachineOperand  = const_cast(MI.getOperand(2));
 ShapeT Shape(, , MRI);
@@ -256,6 +257,7 @@
   case X86::PTDPBUSDV:
   case X86::PTDPBUUDV:
   case X86::PTILEZEROV:
+  case X86::PTDPBF16PSV:
 return true;
   }
 }
Index: llvm/lib/Target/X86/X86LowerAMXType.cpp
===
--- llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -70,7 +70,8 @@
   case Intrinsic::x86_tdpbssd_internal:
   case Intrinsic::x86_tdpbsud_internal:
   case Intrinsic::x86_tdpbusd_internal:
-  case Intrinsic::x86_tdpbuud_internal: {
+  case Intrinsic::x86_tdpbuud_internal:
+  case Intrinsic::x86_tdpbf16ps_internal: {
 switch (OpNo) {
 case 3:
   Row = II->getArgOperand(0);
Index: llvm/lib/Target/X86/X86InstrAMX.td
===
--- llvm/lib/Target/X86/X86InstrAMX.td
+++ llvm/lib/Target/X86/X86InstrAMX.td
@@ -138,6 +138,16 @@
   "tdpbf16ps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
   []>, VEX_4V, T8XS;
 
+// Pseduo instruction for RA.
+let Constraints = "$src4 = $dst" in
+  def PTDPBF16PSV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
+ GR16:$src2, GR16:$src3, 

[PATCH] D97358: [X86] Support amx-bf16 intrinsic.

2021-02-24 Thread LiuChen via Phabricator via cfe-commits
LiuChen3 added a comment.

I don't know why pre-merge-checks failed. I can check-all successfully locally 
in redhat8. I don't have  debian mainchine to reproduce this problem.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D97358/new/

https://reviews.llvm.org/D97358

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D97358: [X86] Support amx-bf16 intrinsic.

2021-02-24 Thread Pengfei Wang via Phabricator via cfe-commits
pengfei accepted this revision.
pengfei added a comment.
This revision is now accepted and ready to land.

LGTM.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D97358/new/

https://reviews.llvm.org/D97358

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D97358: [X86] Support amx-bf16 intrinsic.

2021-02-24 Thread LiuChen via Phabricator via cfe-commits
LiuChen3 updated this revision to Diff 326002.
LiuChen3 added a comment.

Address Pengfei and Yuanke's comments. We don't need more tile type.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D97358/new/

https://reviews.llvm.org/D97358

Files:
  clang/include/clang/Basic/BuiltinsX86_64.def
  clang/lib/Headers/amxintrin.h
  clang/test/CodeGen/X86/amx_api.c
  llvm/include/llvm/IR/IntrinsicsX86.td
  llvm/lib/Target/X86/X86ExpandPseudo.cpp
  llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
  llvm/lib/Target/X86/X86InstrAMX.td
  llvm/lib/Target/X86/X86LowerAMXType.cpp
  llvm/lib/Target/X86/X86PreTileConfig.cpp
  llvm/lib/Target/X86/X86RegisterInfo.cpp
  llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll

Index: llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
===
--- llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
+++ llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile -mattr=+avx512f -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-int8,+amx-bf16,+avx512f -verify-machineinstrs | FileCheck %s
 
 define void @test_amx(i8* %pointer, i8* %base, i64 %stride) {
 ; CHECK-LABEL: test_amx:
@@ -22,6 +22,7 @@
 ; CHECK-NEXT:tdpbsud %tmm2, %tmm1, %tmm0
 ; CHECK-NEXT:tdpbusd %tmm2, %tmm1, %tmm0
 ; CHECK-NEXT:tdpbuud %tmm2, %tmm1, %tmm0
+; CHECK-NEXT:tdpbf16ps %tmm2, %tmm1, %tmm0
 ; CHECK-NEXT:tilestored %tmm0, (%rdi,%rdx)
 ; CHECK-NEXT:tilerelease
 ; CHECK-NEXT:vzeroupper
@@ -33,7 +34,8 @@
   %d1 = call x86_amx @llvm.x86.tdpbsud.internal(i16 8, i16 8, i16 8, x86_amx %d0, x86_amx %a, x86_amx %b)
   %d2 = call x86_amx @llvm.x86.tdpbusd.internal(i16 8, i16 8, i16 8, x86_amx %d1, x86_amx %a, x86_amx %b)
   %d3 = call x86_amx @llvm.x86.tdpbuud.internal(i16 8, i16 8, i16 8, x86_amx %d2, x86_amx %a, x86_amx %b)
-  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %d3)
+  %d4 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 8, i16 8, i16 8, x86_amx %d3, x86_amx %a, x86_amx %b)
+  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %d4)
 
   ret void
 }
@@ -44,4 +46,5 @@
 declare x86_amx @llvm.x86.tdpbsud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
 declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
 declare x86_amx @llvm.x86.tdpbuud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
+declare x86_amx @llvm.x86.tdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
 declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
Index: llvm/lib/Target/X86/X86RegisterInfo.cpp
===
--- llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -888,6 +888,7 @@
   case X86::PTDPBUSDV:
   case X86::PTDPBUUDV:
   case X86::PTILEZEROV:
+  case X86::PTDPBF16PSV:
 MachineOperand  = MI->getOperand(1);
 MachineOperand  = MI->getOperand(2);
 ShapeT Shape(, , MRI);
Index: llvm/lib/Target/X86/X86PreTileConfig.cpp
===
--- llvm/lib/Target/X86/X86PreTileConfig.cpp
+++ llvm/lib/Target/X86/X86PreTileConfig.cpp
@@ -159,6 +159,7 @@
   case X86::PTDPBUSDV:
   case X86::PTDPBUUDV:
   case X86::PTILEZEROV:
+  case X86::PTDPBF16PSV:
 MachineOperand  = const_cast(MI.getOperand(1));
 MachineOperand  = const_cast(MI.getOperand(2));
 ShapeT Shape(, , MRI);
@@ -256,6 +257,7 @@
   case X86::PTDPBUSDV:
   case X86::PTDPBUUDV:
   case X86::PTILEZEROV:
+  case X86::PTDPBF16PSV:
 return true;
   }
 }
Index: llvm/lib/Target/X86/X86LowerAMXType.cpp
===
--- llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -70,7 +70,8 @@
   case Intrinsic::x86_tdpbssd_internal:
   case Intrinsic::x86_tdpbsud_internal:
   case Intrinsic::x86_tdpbusd_internal:
-  case Intrinsic::x86_tdpbuud_internal: {
+  case Intrinsic::x86_tdpbuud_internal:
+  case Intrinsic::x86_tdpbf16ps_internal: {
 switch (OpNo) {
 case 3:
   Row = II->getArgOperand(0);
Index: llvm/lib/Target/X86/X86InstrAMX.td
===
--- llvm/lib/Target/X86/X86InstrAMX.td
+++ llvm/lib/Target/X86/X86InstrAMX.td
@@ -138,6 +138,16 @@
   "tdpbf16ps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
   []>, VEX_4V, T8XS;
 
+// Pseduo instruction for RA.
+let Constraints = "$src4 = $dst" in
+  def PTDPBF16PSV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
+ GR16:$src2, GR16:$src3, TILE:$src4,
+ TILE:$src5, TILE:$src6),
+  

[PATCH] D97358: [X86] Support amx-bf16 intrinsic.

2021-02-23 Thread Pengfei Wang via Phabricator via cfe-commits
pengfei added inline comments.



Comment at: clang/lib/Headers/amxintrin.h:283
 
+typedef struct __tile1024bf16_str {
+  const unsigned short row;

LiuChen3 wrote:
> LuoYuanke wrote:
> > pengfei wrote:
> > > Is there much value to differentiate the type? We are using the same AMX 
> > > type in the builtins. What do you think? @LuoYuanke 
> > My first though is that we can reuse __tile1024i for bf16 tile for 2 
> > reasons.
> > 1. We don't access the element of the tile. 
> > 2. The destination element of amx-int8 is int32 and the destination element 
> > of amx-bf16 is float32, the element size is the same.
> Does this means that user need to do explicitly type conversion?
We don't allow user to use the tile in the structure. User should always use 
load/store intrinsic to pass there own data.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D97358/new/

https://reviews.llvm.org/D97358

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D97358: [X86] Support amx-bf16 intrinsic.

2021-02-23 Thread LiuChen via Phabricator via cfe-commits
LiuChen3 added inline comments.



Comment at: clang/lib/Headers/amxintrin.h:283
 
+typedef struct __tile1024bf16_str {
+  const unsigned short row;

LuoYuanke wrote:
> pengfei wrote:
> > Is there much value to differentiate the type? We are using the same AMX 
> > type in the builtins. What do you think? @LuoYuanke 
> My first though is that we can reuse __tile1024i for bf16 tile for 2 reasons.
> 1. We don't access the element of the tile. 
> 2. The destination element of amx-int8 is int32 and the destination element 
> of amx-bf16 is float32, the element size is the same.
Does this means that user need to do explicitly type conversion?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D97358/new/

https://reviews.llvm.org/D97358

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D97358: [X86] Support amx-bf16 intrinsic.

2021-02-23 Thread LuoYuanke via Phabricator via cfe-commits
LuoYuanke added inline comments.



Comment at: clang/lib/Headers/amxintrin.h:283
 
+typedef struct __tile1024bf16_str {
+  const unsigned short row;

pengfei wrote:
> Is there much value to differentiate the type? We are using the same AMX type 
> in the builtins. What do you think? @LuoYuanke 
My first though is that we can reuse __tile1024i for bf16 tile for 2 reasons.
1. We don't access the element of the tile. 
2. The destination element of amx-int8 is int32 and the destination element of 
amx-bf16 is float32, the element size is the same.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D97358/new/

https://reviews.llvm.org/D97358

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D97358: [X86] Support amx-bf16 intrinsic.

2021-02-23 Thread Pengfei Wang via Phabricator via cfe-commits
pengfei added inline comments.



Comment at: clang/lib/Headers/amxintrin.h:283
 
+typedef struct __tile1024bf16_str {
+  const unsigned short row;

Is there much value to differentiate the type? We are using the same AMX type 
in the builtins. What do you think? @LuoYuanke 


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D97358/new/

https://reviews.llvm.org/D97358

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D97358: [X86] Support amx-bf16 intrinsic.

2021-02-23 Thread LiuChen via Phabricator via cfe-commits
LiuChen3 updated this revision to Diff 325988.
LiuChen3 added a comment.

Adding back 'avx512f' to amx-tile-basic.ll


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D97358/new/

https://reviews.llvm.org/D97358

Files:
  clang/include/clang/Basic/BuiltinsX86_64.def
  clang/lib/Headers/amxintrin.h
  clang/test/CodeGen/X86/amx_api.c
  llvm/include/llvm/IR/IntrinsicsX86.td
  llvm/lib/Target/X86/X86ExpandPseudo.cpp
  llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
  llvm/lib/Target/X86/X86InstrAMX.td
  llvm/lib/Target/X86/X86LowerAMXType.cpp
  llvm/lib/Target/X86/X86PreTileConfig.cpp
  llvm/lib/Target/X86/X86RegisterInfo.cpp
  llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll

Index: llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
===
--- llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
+++ llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile -mattr=+avx512f -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-int8,+amx-bf16,+avx512f -verify-machineinstrs | FileCheck %s
 
 define void @test_amx(i8* %pointer, i8* %base, i64 %stride) {
 ; CHECK-LABEL: test_amx:
@@ -22,6 +22,7 @@
 ; CHECK-NEXT:tdpbsud %tmm2, %tmm1, %tmm0
 ; CHECK-NEXT:tdpbusd %tmm2, %tmm1, %tmm0
 ; CHECK-NEXT:tdpbuud %tmm2, %tmm1, %tmm0
+; CHECK-NEXT:tdpbf16ps %tmm2, %tmm1, %tmm0
 ; CHECK-NEXT:tilestored %tmm0, (%rdi,%rdx)
 ; CHECK-NEXT:tilerelease
 ; CHECK-NEXT:vzeroupper
@@ -33,7 +34,8 @@
   %d1 = call x86_amx @llvm.x86.tdpbsud.internal(i16 8, i16 8, i16 8, x86_amx %d0, x86_amx %a, x86_amx %b)
   %d2 = call x86_amx @llvm.x86.tdpbusd.internal(i16 8, i16 8, i16 8, x86_amx %d1, x86_amx %a, x86_amx %b)
   %d3 = call x86_amx @llvm.x86.tdpbuud.internal(i16 8, i16 8, i16 8, x86_amx %d2, x86_amx %a, x86_amx %b)
-  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %d3)
+  %d4 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 8, i16 8, i16 8, x86_amx %d3, x86_amx %a, x86_amx %b)
+  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %d4)
 
   ret void
 }
@@ -44,4 +46,5 @@
 declare x86_amx @llvm.x86.tdpbsud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
 declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
 declare x86_amx @llvm.x86.tdpbuud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
+declare x86_amx @llvm.x86.tdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
 declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
Index: llvm/lib/Target/X86/X86RegisterInfo.cpp
===
--- llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -888,6 +888,7 @@
   case X86::PTDPBUSDV:
   case X86::PTDPBUUDV:
   case X86::PTILEZEROV:
+  case X86::PTDPBF16PSV:
 MachineOperand  = MI->getOperand(1);
 MachineOperand  = MI->getOperand(2);
 ShapeT Shape(, , MRI);
Index: llvm/lib/Target/X86/X86PreTileConfig.cpp
===
--- llvm/lib/Target/X86/X86PreTileConfig.cpp
+++ llvm/lib/Target/X86/X86PreTileConfig.cpp
@@ -159,6 +159,7 @@
   case X86::PTDPBUSDV:
   case X86::PTDPBUUDV:
   case X86::PTILEZEROV:
+  case X86::PTDPBF16PSV:
 MachineOperand  = const_cast(MI.getOperand(1));
 MachineOperand  = const_cast(MI.getOperand(2));
 ShapeT Shape(, , MRI);
@@ -256,6 +257,7 @@
   case X86::PTDPBUSDV:
   case X86::PTDPBUUDV:
   case X86::PTILEZEROV:
+  case X86::PTDPBF16PSV:
 return true;
   }
 }
Index: llvm/lib/Target/X86/X86LowerAMXType.cpp
===
--- llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -70,7 +70,8 @@
   case Intrinsic::x86_tdpbssd_internal:
   case Intrinsic::x86_tdpbsud_internal:
   case Intrinsic::x86_tdpbusd_internal:
-  case Intrinsic::x86_tdpbuud_internal: {
+  case Intrinsic::x86_tdpbuud_internal:
+  case Intrinsic::x86_tdpbf16ps_internal: {
 switch (OpNo) {
 case 3:
   Row = II->getArgOperand(0);
Index: llvm/lib/Target/X86/X86InstrAMX.td
===
--- llvm/lib/Target/X86/X86InstrAMX.td
+++ llvm/lib/Target/X86/X86InstrAMX.td
@@ -138,6 +138,16 @@
   "tdpbf16ps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
   []>, VEX_4V, T8XS;
 
+// Pseduo instruction for RA.
+let Constraints = "$src4 = $dst" in
+  def PTDPBF16PSV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
+ GR16:$src2, GR16:$src3, TILE:$src4,
+ TILE:$src5, TILE:$src6),
+ 

[PATCH] D97358: [X86] Support amx-bf16 intrinsic.

2021-02-23 Thread LiuChen via Phabricator via cfe-commits
LiuChen3 created this revision.
Herald added subscribers: pengfei, hiraditya.
LiuChen3 requested review of this revision.
Herald added projects: clang, LLVM.
Herald added subscribers: llvm-commits, cfe-commits.

Adding support for intrinsics of AMX-BF16.
This patch alse fix a bug that AMX-INT8 instructions will be selected with wrong
predicate.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D97358

Files:
  clang/include/clang/Basic/BuiltinsX86_64.def
  clang/lib/Headers/amxintrin.h
  clang/test/CodeGen/X86/amx_api.c
  llvm/include/llvm/IR/IntrinsicsX86.td
  llvm/lib/Target/X86/X86ExpandPseudo.cpp
  llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
  llvm/lib/Target/X86/X86InstrAMX.td
  llvm/lib/Target/X86/X86LowerAMXType.cpp
  llvm/lib/Target/X86/X86PreTileConfig.cpp
  llvm/lib/Target/X86/X86RegisterInfo.cpp
  llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll

Index: llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
===
--- llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
+++ llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
@@ -1,11 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile -mattr=+avx512f -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-int8,+amx-bf16 -verify-machineinstrs | FileCheck %s
 
 define void @test_amx(i8* %pointer, i8* %base, i64 %stride) {
 ; CHECK-LABEL: test_amx:
 ; CHECK:   # %bb.0:
-; CHECK-NEXT:vpxord %zmm0, %zmm0, %zmm0
-; CHECK-NEXT:vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:xorps %xmm0, %xmm0
+; CHECK-NEXT:movups %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:movups %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:movups %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:movups %xmm0, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:movb $1, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:movb $8, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:movw $8, -{{[0-9]+}}(%rsp)
@@ -22,9 +25,9 @@
 ; CHECK-NEXT:tdpbsud %tmm2, %tmm1, %tmm0
 ; CHECK-NEXT:tdpbusd %tmm2, %tmm1, %tmm0
 ; CHECK-NEXT:tdpbuud %tmm2, %tmm1, %tmm0
+; CHECK-NEXT:tdpbf16ps %tmm2, %tmm1, %tmm0
 ; CHECK-NEXT:tilestored %tmm0, (%rdi,%rdx)
 ; CHECK-NEXT:tilerelease
-; CHECK-NEXT:vzeroupper
 ; CHECK-NEXT:retq
   %c = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8)
   %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride)
@@ -33,7 +36,8 @@
   %d1 = call x86_amx @llvm.x86.tdpbsud.internal(i16 8, i16 8, i16 8, x86_amx %d0, x86_amx %a, x86_amx %b)
   %d2 = call x86_amx @llvm.x86.tdpbusd.internal(i16 8, i16 8, i16 8, x86_amx %d1, x86_amx %a, x86_amx %b)
   %d3 = call x86_amx @llvm.x86.tdpbuud.internal(i16 8, i16 8, i16 8, x86_amx %d2, x86_amx %a, x86_amx %b)
-  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %d3)
+  %d4 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 8, i16 8, i16 8, x86_amx %d3, x86_amx %a, x86_amx %b)
+  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %d4)
 
   ret void
 }
@@ -44,4 +48,5 @@
 declare x86_amx @llvm.x86.tdpbsud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
 declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
 declare x86_amx @llvm.x86.tdpbuud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
+declare x86_amx @llvm.x86.tdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
 declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
Index: llvm/lib/Target/X86/X86RegisterInfo.cpp
===
--- llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -888,6 +888,7 @@
   case X86::PTDPBUSDV:
   case X86::PTDPBUUDV:
   case X86::PTILEZEROV:
+  case X86::PTDPBF16PSV:
 MachineOperand  = MI->getOperand(1);
 MachineOperand  = MI->getOperand(2);
 ShapeT Shape(, , MRI);
Index: llvm/lib/Target/X86/X86PreTileConfig.cpp
===
--- llvm/lib/Target/X86/X86PreTileConfig.cpp
+++ llvm/lib/Target/X86/X86PreTileConfig.cpp
@@ -159,6 +159,7 @@
   case X86::PTDPBUSDV:
   case X86::PTDPBUUDV:
   case X86::PTILEZEROV:
+  case X86::PTDPBF16PSV:
 MachineOperand  = const_cast(MI.getOperand(1));
 MachineOperand  = const_cast(MI.getOperand(2));
 ShapeT Shape(, , MRI);
@@ -256,6 +257,7 @@
   case X86::PTDPBUSDV:
   case X86::PTDPBUUDV:
   case X86::PTILEZEROV:
+  case X86::PTDPBF16PSV:
 return true;
   }
 }
Index: llvm/lib/Target/X86/X86LowerAMXType.cpp
===
--- llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -70,7 +70,8 @@
   case Intrinsic::x86_tdpbssd_internal:
   case Intrinsic::x86_tdpbsud_internal:
   case Intrinsic::x86_tdpbusd_internal:
-  case