[PATCH] D157251: [X86][regcall] Do not produce @ number suffix if it is regcall4

2023-08-09 Thread Bing Yu via Phabricator via cfe-commits
yubing updated this revision to Diff 548479.
yubing added a comment.

small fix


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D157251/new/

https://reviews.llvm.org/D157251

Files:
  clang/lib/AST/Mangle.cpp
  clang/test/CodeGen/mangle-windows-regcall4.c


Index: clang/test/CodeGen/mangle-windows-regcall4.c
===
--- /dev/null
+++ clang/test/CodeGen/mangle-windows-regcall4.c
@@ -0,0 +1,43 @@
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-pc-win32 -regcall4 | 
FileCheck %s
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-mingw32 -regcall4 | 
FileCheck %s
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-pc-windows-msvc-elf 
-regcall4 | FileCheck %s --check-prefix=ELF32
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-pc-win32 -regcall4 | 
FileCheck %s --check-prefix=X64
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-mingw32 -regcall4 | 
FileCheck %s --check-prefix=X64
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-pc-windows-msvc-elf 
-regcall4 | FileCheck %s --check-prefix=ELF64
+
+// CHECK: target datalayout = "e-m:x-{{.*}}"
+// X64: target datalayout = "e-m:w-{{.*}}"
+// ELF32: target datalayout = "e-m:e-{{.*}}"
+// ELF64: target datalayout = "e-m:e-{{.*}}"
+
+void __regcall v1(void) {}
+// CHECK: define dso_local x86_regcallcc void @__regcall4__v1
+// X64: define dso_local x86_regcallcc void @__regcall4__v1
+// ELF32: define{{.*}} x86_regcallcc void @__regcall4__v1
+// ELF64: define{{.*}} x86_regcallcc void @__regcall4__v1
+
+void __regcall v2(char a) {}
+// CHECK: define dso_local x86_regcallcc void @__regcall4__v2
+// X64: define dso_local x86_regcallcc void @__regcall4__v2
+// ELF32: define{{.*}} x86_regcallcc void @__regcall4__v2
+// ELF64: define{{.*}} x86_regcallcc void @__regcall4__v2
+
+void __regcall v3(short a) {}
+// CHECK: define dso_local x86_regcallcc void @__regcall4__v3
+// X64: define dso_local x86_regcallcc void @__regcall4__v3
+
+void __regcall v4(int a) {}
+// CHECK: define dso_local x86_regcallcc void @__regcall4__v4
+// X64: define dso_local x86_regcallcc void @__regcall4__v4
+
+void __regcall v5(long long a) {}
+// CHECK: define dso_local x86_regcallcc void @__regcall4__v5
+// X64: define dso_local x86_regcallcc void @__regcall4__v5
+
+void __regcall v6(char a, char b) {}
+// CHECK: define dso_local x86_regcallcc void @__regcall4__v6(i8 inreg noundef 
signext %a, i8 inreg noundef signext %b)
+// X64: define dso_local x86_regcallcc void @__regcall4__v6(i8 noundef %a, i8 
noundef %b)
+
+void __regcall v7(long long a, char b, char c, short d) {}
+// CHECK: define dso_local x86_regcallcc void @__regcall4__v7(i64 noundef %a, 
i8 inreg noundef signext %b, i8 inreg noundef signext %c, i16 inreg noundef 
signext %d)
+// X64: define dso_local x86_regcallcc void @__regcall4__v7(i64 noundef %a, i8 
noundef %b, i8 noundef %c, i16 noundef %d)
Index: clang/lib/AST/Mangle.cpp
===
--- clang/lib/AST/Mangle.cpp
+++ clang/lib/AST/Mangle.cpp
@@ -212,6 +212,10 @@
   else
 mangleCXXName(GD, Out);
 
+  // do no produce @ suffix if it is regcall4
+  if (getASTContext().getLangOpts().RegCall4)
+return;
+
   const FunctionDecl *FD = cast<FunctionDecl>(D);
   const FunctionType *FT = FD->getType()->castAs<FunctionType>();
   const FunctionProtoType *Proto = dyn_cast<FunctionProtoType>(FT);


Index: clang/test/CodeGen/mangle-windows-regcall4.c
===
--- /dev/null
+++ clang/test/CodeGen/mangle-windows-regcall4.c
@@ -0,0 +1,43 @@
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-pc-win32 -regcall4 | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-mingw32 -regcall4 | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-pc-windows-msvc-elf -regcall4 | FileCheck %s --check-prefix=ELF32
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-pc-win32 -regcall4 | FileCheck %s --check-prefix=X64
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-mingw32 -regcall4 | FileCheck %s --check-prefix=X64
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-pc-windows-msvc-elf -regcall4 | FileCheck %s --check-prefix=ELF64
+
+// CHECK: target datalayout = "e-m:x-{{.*}}"
+// X64: target datalayout = "e-m:w-{{.*}}"
+// ELF32: target datalayout = "e-m:e-{{.*}}"
+// ELF64: target datalayout = "e-m:e-{{.*}}"
+
+void __regcall v1(void) {}
+// CHECK: define dso_local x86_regcallcc void @__regcall4__v1
+// X64: define dso_local x86_regcallcc void @__regcall4__v1
+// ELF32: define{{.*}} x86_regcallcc void @__regcall4__v1
+// ELF64: define{{.*}} x86_regcallcc void @__regcall4__v1
+
+void __regcall v2(char a) {}
+// CHECK: define dso_local x86_regcallcc void @__regcall4__v2
+// X64: define dso_local x86_regcallcc void @__regcall4__v2
+// ELF32: define{{.*}} x86_regcallcc void @__regcall4__v2
+// ELF64: define{{.*}} x86_regcallcc void @__regcall4__v2
+
+void __regcall v3(short a) {}

[PATCH] D157251: [X86][regcall] Do not produce @ number suffix if it is regcall4

2023-08-09 Thread Bing Yu via Phabricator via cfe-commits
yubing updated this revision to Diff 548478.
yubing added a comment.

small fix


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D157251/new/

https://reviews.llvm.org/D157251

Files:
  clang/lib/AST/Mangle.cpp
  clang/test/CodeGen/mangle-windows-regcall4.c


Index: clang/test/CodeGen/mangle-windows-regcall4.c
===
--- /dev/null
+++ clang/test/CodeGen/mangle-windows-regcall4.c
@@ -0,0 +1,43 @@
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-pc-win32 -regcall4 | 
FileCheck %s
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-mingw32 -regcall4 | 
FileCheck %s
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-pc-windows-msvc-elf 
-regcall4 | FileCheck %s --check-prefix=ELF32
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-pc-win32 -regcall4 | 
FileCheck %s --check-prefix=X64
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-mingw32 -regcall4 | 
FileCheck %s --check-prefix=X64
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-pc-windows-msvc-elf 
-regcall4 | FileCheck %s --check-prefix=ELF64
+
+// CHECK: target datalayout = "e-m:x-{{.*}}"
+// X64: target datalayout = "e-m:w-{{.*}}"
+// ELF32: target datalayout = "e-m:e-{{.*}}"
+// ELF64: target datalayout = "e-m:e-{{.*}}"
+
+void __regcall v1(void) {}
+// CHECK: define dso_local x86_regcallcc void @__regcall4__v1
+// X64: define dso_local x86_regcallcc void @__regcall4__v1
+// ELF32: define{{.*}} x86_regcallcc void @__regcall4__v1
+// ELF64: define{{.*}} x86_regcallcc void @__regcall4__v1
+
+void __regcall v2(char a) {}
+// CHECK: define dso_local x86_regcallcc void @__regcall4__v2
+// X64: define dso_local x86_regcallcc void @__regcall4__v2
+// ELF32: define{{.*}} x86_regcallcc void @__regcall4__v2
+// ELF64: define{{.*}} x86_regcallcc void @__regcall4__v2
+
+void __regcall v3(short a) {}
+// CHECK: define dso_local x86_regcallcc void @__regcall4__v3
+// X64: define dso_local x86_regcallcc void @__regcall4__v3
+
+void __regcall v4(int a) {}
+// CHECK: define dso_local x86_regcallcc void @__regcall4__v4
+// X64: define dso_local x86_regcallcc void @__regcall4__v4
+
+void __regcall v5(long long a) {}
+// CHECK: define dso_local x86_regcallcc void @__regcall4__v5
+// X64: define dso_local x86_regcallcc void @__regcall4__v5
+
+void __regcall v6(char a, char b) {}
+// CHECK: define dso_local x86_regcallcc void @__regcall4__v6(i8 inreg noundef 
signext %a, i8 inreg noundef signext %b)
+// X64: define dso_local x86_regcallcc void @__regcall4__v6(i8 noundef %a, i8 
noundef %b)
+
+void __regcall v7(long long a, char b, char c, short d) {}
+// CHECK: define dso_local x86_regcallcc void @__regcall4__v7(i64 noundef %a, 
i8 inreg noundef signext %b, i8 inreg noundef signext %c, i16 inreg noundef 
signext %d)
+// X64: define dso_local x86_regcallcc void @__regcall4__v7(i64 noundef %a, i8 
noundef %b, i8 noundef %c, i16 noundef %d)
\ No newline at end of file
Index: clang/lib/AST/Mangle.cpp
===
--- clang/lib/AST/Mangle.cpp
+++ clang/lib/AST/Mangle.cpp
@@ -212,6 +212,10 @@
   else
 mangleCXXName(GD, Out);
 
+  // do no produce @ suffix if it is regcall4
+  if (getASTContext().getLangOpts().RegCall4)
+return;
+
   const FunctionDecl *FD = cast<FunctionDecl>(D);
   const FunctionType *FT = FD->getType()->castAs<FunctionType>();
   const FunctionProtoType *Proto = dyn_cast<FunctionProtoType>(FT);


Index: clang/test/CodeGen/mangle-windows-regcall4.c
===
--- /dev/null
+++ clang/test/CodeGen/mangle-windows-regcall4.c
@@ -0,0 +1,43 @@
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-pc-win32 -regcall4 | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-mingw32 -regcall4 | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-pc-windows-msvc-elf -regcall4 | FileCheck %s --check-prefix=ELF32
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-pc-win32 -regcall4 | FileCheck %s --check-prefix=X64
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-mingw32 -regcall4 | FileCheck %s --check-prefix=X64
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-pc-windows-msvc-elf -regcall4 | FileCheck %s --check-prefix=ELF64
+
+// CHECK: target datalayout = "e-m:x-{{.*}}"
+// X64: target datalayout = "e-m:w-{{.*}}"
+// ELF32: target datalayout = "e-m:e-{{.*}}"
+// ELF64: target datalayout = "e-m:e-{{.*}}"
+
+void __regcall v1(void) {}
+// CHECK: define dso_local x86_regcallcc void @__regcall4__v1
+// X64: define dso_local x86_regcallcc void @__regcall4__v1
+// ELF32: define{{.*}} x86_regcallcc void @__regcall4__v1
+// ELF64: define{{.*}} x86_regcallcc void @__regcall4__v1
+
+void __regcall v2(char a) {}
+// CHECK: define dso_local x86_regcallcc void @__regcall4__v2
+// X64: define dso_local x86_regcallcc void @__regcall4__v2
+// ELF32: define{{.*}} x86_regcallcc void @__regcall4__v2
+// ELF64: define{{.*}} x86_regcallcc void @__regcall4__v2
+

[PATCH] D157251: [X86][regcall] Do not produce @ number suffix if it is regcall4

2023-08-09 Thread Bing Yu via Phabricator via cfe-commits
yubing updated this revision to Diff 548477.
yubing added a comment.
Herald added a subscriber: mstorsjo.

add testcase


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D157251/new/

https://reviews.llvm.org/D157251

Files:
  clang/lib/AST/Mangle.cpp
  clang/test/CodeGen/mangle-windows-regcall4.c


Index: clang/test/CodeGen/mangle-windows-regcall4.c
===
--- /dev/null
+++ clang/test/CodeGen/mangle-windows-regcall4.c
@@ -0,0 +1,42 @@
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-pc-win32 -regcall4 | 
FileCheck %s
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-mingw32 -regcall4 | 
FileCheck %s
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-pc-windows-msvc-elf 
-regcall4 | FileCheck %s --check-prefix=ELF32
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-pc-win32 -regcall4 | 
FileCheck %s --check-prefix=X64
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-mingw32 -regcall4 | 
FileCheck %s --check-prefix=X64
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-pc-windows-msvc-elf 
-regcall4 | FileCheck %s --check-prefix=ELF64
+
+// CHECK: target datalayout = "e-m:x-{{.*}}"
+// X64: target datalayout = "e-m:w-{{.*}}"
+// ELF32: target datalayout = "e-m:e-{{.*}}"
+// ELF64: target datalayout = "e-m:e-{{.*}}"
+
+void __regcall v1(void) {}
+// CHECK: define dso_local x86_regcallcc void @__regcall4__v1
+// X64: define dso_local x86_regcallcc void @__regcall4__v1
+// ELF32: define{{.*}} x86_regcallcc void @__regcall4__v1
+// ELF64: define{{.*}} x86_regcallcc void @__regcall4__v1
+
+void __regcall v2(char a) {}
+// CHECK: define dso_local x86_regcallcc void @__regcall4__v2
+// X64: define dso_local x86_regcallcc void @__regcall4__v2
+// ELF32: define{{.*}} x86_regcallcc void @__regcall4__v2
+// ELF64: define{{.*}} x86_regcallcc void @__regcall4__v2
+
+void __regcall v3(short a) {}
+// CHECK: define dso_local x86_regcallcc void @__regcall4__v3
+// X64: define dso_local x86_regcallcc void @__regcall4__v3
+
+void __regcall v4(int a) {}
+// CHECK: define dso_local x86_regcallcc void @__regcall4__v4
+// X64: define dso_local x86_regcallcc void @__regcall4__v4
+
+void __regcall v5(long long a) {}
+// CHECK: define dso_local x86_regcallcc void @__regcall4__v5
+// X64: define dso_local x86_regcallcc void @__regcall4__v5
+
+void __regcall v6(char a, char b) {}
+// CHECK: define dso_local x86_regcallcc void @__regcall4__v6
+// X64: define dso_local x86_regcallcc void @__regcall4__v6
+
+void __regcall v7(long long a, char b, char c, short d) {}
+// CHECK: define dso_local x86_regcallcc void @__regcall4__v7(i64 noundef %a, 
i8 inreg noundef signext %b, i8 inreg noundef signext %c, i16 inreg noundef 
signext %d)
\ No newline at end of file
Index: clang/lib/AST/Mangle.cpp
===
--- clang/lib/AST/Mangle.cpp
+++ clang/lib/AST/Mangle.cpp
@@ -212,6 +212,10 @@
   else
 mangleCXXName(GD, Out);
 
+  // do no produce @ suffix if it is regcall4
+  if (getASTContext().getLangOpts().RegCall4)
+return;
+
   const FunctionDecl *FD = cast<FunctionDecl>(D);
   const FunctionType *FT = FD->getType()->castAs<FunctionType>();
   const FunctionProtoType *Proto = dyn_cast<FunctionProtoType>(FT);


Index: clang/test/CodeGen/mangle-windows-regcall4.c
===
--- /dev/null
+++ clang/test/CodeGen/mangle-windows-regcall4.c
@@ -0,0 +1,42 @@
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-pc-win32 -regcall4 | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-mingw32 -regcall4 | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-pc-windows-msvc-elf -regcall4 | FileCheck %s --check-prefix=ELF32
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-pc-win32 -regcall4 | FileCheck %s --check-prefix=X64
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-mingw32 -regcall4 | FileCheck %s --check-prefix=X64
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-pc-windows-msvc-elf -regcall4 | FileCheck %s --check-prefix=ELF64
+
+// CHECK: target datalayout = "e-m:x-{{.*}}"
+// X64: target datalayout = "e-m:w-{{.*}}"
+// ELF32: target datalayout = "e-m:e-{{.*}}"
+// ELF64: target datalayout = "e-m:e-{{.*}}"
+
+void __regcall v1(void) {}
+// CHECK: define dso_local x86_regcallcc void @__regcall4__v1
+// X64: define dso_local x86_regcallcc void @__regcall4__v1
+// ELF32: define{{.*}} x86_regcallcc void @__regcall4__v1
+// ELF64: define{{.*}} x86_regcallcc void @__regcall4__v1
+
+void __regcall v2(char a) {}
+// CHECK: define dso_local x86_regcallcc void @__regcall4__v2
+// X64: define dso_local x86_regcallcc void @__regcall4__v2
+// ELF32: define{{.*}} x86_regcallcc void @__regcall4__v2
+// ELF64: define{{.*}} x86_regcallcc void @__regcall4__v2
+
+void __regcall v3(short a) {}
+// CHECK: define dso_local x86_regcallcc void @__regcall4__v3
+// X64: define dso_local x86_regcallcc void @__regcall4__v3
+
+void __regcall 

[PATCH] D157251: [X86][regcall] Do not produce @ number suffix if it is regcall4

2023-08-06 Thread Bing Yu via Phabricator via cfe-commits
yubing created this revision.
Herald added a project: All.
yubing requested review of this revision.
Herald added a project: clang.
Herald added a subscriber: cfe-commits.

Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D157251

Files:
  clang/lib/AST/Mangle.cpp


Index: clang/lib/AST/Mangle.cpp
===
--- clang/lib/AST/Mangle.cpp
+++ clang/lib/AST/Mangle.cpp
@@ -212,6 +212,10 @@
   else
 mangleCXXName(GD, Out);
 
+  // do no produce @ suffix if it is regcall4
+  if (getASTContext().getLangOpts().RegCall4)
+return;
+
   const FunctionDecl *FD = cast<FunctionDecl>(D);
   const FunctionType *FT = FD->getType()->castAs<FunctionType>();
   const FunctionProtoType *Proto = dyn_cast<FunctionProtoType>(FT);


Index: clang/lib/AST/Mangle.cpp
===
--- clang/lib/AST/Mangle.cpp
+++ clang/lib/AST/Mangle.cpp
@@ -212,6 +212,10 @@
   else
 mangleCXXName(GD, Out);
 
+  // do no produce @ suffix if it is regcall4
+  if (getASTContext().getLangOpts().RegCall4)
+return;
+
   const FunctionDecl *FD = cast<FunctionDecl>(D);
   const FunctionType *FT = FD->getType()->castAs<FunctionType>();
   const FunctionProtoType *Proto = dyn_cast<FunctionProtoType>(FT);
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D155863: [X86][Regcall] Add an option to respect regcall ABI v.4 in win64

2023-08-02 Thread Bing Yu via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rG6ee497aa0b48: [X86][Regcall] Add an option to respect 
regcall ABI v.4 in win64win32 (authored by yubing).

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D155863/new/

https://reviews.llvm.org/D155863

Files:
  clang/include/clang/Basic/LangOptions.def
  clang/include/clang/Driver/Options.td
  clang/lib/AST/ItaniumMangle.cpp
  clang/lib/AST/Mangle.cpp
  clang/lib/AST/MicrosoftMangle.cpp
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/lib/Driver/ToolChains/Clang.cpp
  clang/test/CodeGen/check-regcall4-moduleflag.c
  clang/test/CodeGen/regcall4.c
  clang/test/CodeGenCXX/regcall4.cpp
  clang/test/Driver/cl-cc-flags.c
  llvm/lib/Target/X86/X86CallingConv.td
  llvm/test/CodeGen/X86/sse-regcall4.ll

Index: llvm/test/CodeGen/X86/sse-regcall4.ll
===
--- /dev/null
+++ llvm/test/CodeGen/X86/sse-regcall4.ll
@@ -0,0 +1,467 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-pc-win32 -mattr=+sse | FileCheck --check-prefix=WIN32 %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse | FileCheck --check-prefix=WIN64 %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+sse | FileCheck --check-prefix=LINUXOSX %s
+
+; Test regcall when receiving/returning i1
+define x86_regcallcc i1 @test_argReti1(i1 %a)  {
+; WIN32-LABEL: test_argReti1:
+; WIN32:   # %bb.0:
+; WIN32-NEXT:incb %cl
+; WIN32-NEXT:# kill: def $cl killed $cl killed $ecx
+; WIN32-NEXT:retl
+;
+; WIN64-LABEL: test_argReti1:
+; WIN64:   # %bb.0:
+; WIN64-NEXT:incb %al
+; WIN64-NEXT:# kill: def $al killed $al killed $eax
+; WIN64-NEXT:retq
+;
+; LINUXOSX-LABEL: test_argReti1:
+; LINUXOSX:   # %bb.0:
+; LINUXOSX-NEXT:incb %al
+; LINUXOSX-NEXT:# kill: def $al killed $al killed $eax
+; LINUXOSX-NEXT:retq
+  %add = add i1 %a, 1
+  ret i1 %add
+}
+
+; Test regcall when passing/retrieving i1
+define x86_regcallcc i1 @test_CallargReti1(i1 %a)  {
+; WIN32-LABEL: test_CallargReti1:
+; WIN32:   # %bb.0:
+; WIN32-NEXT:incb %cl
+; WIN32-NEXT:movzbl %cl, %ecx
+; WIN32-NEXT:calll _test_argReti1
+; WIN32-NEXT:incb %cl
+; WIN32-NEXT:retl
+;
+; WIN64-LABEL: test_CallargReti1:
+; WIN64:   # %bb.0:
+; WIN64-NEXT:pushq %rax
+; WIN64-NEXT:.seh_stackalloc 8
+; WIN64-NEXT:.seh_endprologue
+; WIN64-NEXT:incb %al
+; WIN64-NEXT:movzbl %al, %eax
+; WIN64-NEXT:callq test_argReti1
+; WIN64-NEXT:incb %al
+; WIN64-NEXT:popq %rcx
+; WIN64-NEXT:retq
+; WIN64-NEXT:.seh_endproc
+;
+; LINUXOSX-LABEL: test_CallargReti1:
+; LINUXOSX:   # %bb.0:
+; LINUXOSX-NEXT:pushq %rax
+; LINUXOSX-NEXT:.cfi_def_cfa_offset 16
+; LINUXOSX-NEXT:incb %al
+; LINUXOSX-NEXT:movzbl %al, %eax
+; LINUXOSX-NEXT:callq *test_argReti1@GOTPCREL(%rip)
+; LINUXOSX-NEXT:incb %al
+; LINUXOSX-NEXT:popq %rcx
+; LINUXOSX-NEXT:.cfi_def_cfa_offset 8
+; LINUXOSX-NEXT:retq
+  %b = add i1 %a, 1
+  %c = call x86_regcallcc i1 @test_argReti1(i1 %b)
+  %d = add i1 %c, 1
+  ret i1 %d
+}
+
+;test calling conventions - input parameters, callee saved xmms
+define x86_regcallcc <16 x float> @testf32_inp(<16 x float> %a, <16 x float> %b, <16 x float> %c) nounwind {
+; WIN32-LABEL: testf32_inp:
+; WIN32:   # %bb.0:
+; WIN32-NEXT:pushl %ebp
+; WIN32-NEXT:movl %esp, %ebp
+; WIN32-NEXT:andl $-16, %esp
+; WIN32-NEXT:subl $32, %esp
+; WIN32-NEXT:movaps %xmm7, (%esp) # 16-byte Spill
+; WIN32-NEXT:movaps %xmm6, %xmm7
+; WIN32-NEXT:movaps %xmm5, %xmm6
+; WIN32-NEXT:movaps %xmm3, %xmm5
+; WIN32-NEXT:movaps %xmm2, %xmm3
+; WIN32-NEXT:movaps %xmm1, %xmm2
+; WIN32-NEXT:movaps %xmm0, %xmm1
+; WIN32-NEXT:addps %xmm4, %xmm0
+; WIN32-NEXT:mulps %xmm4, %xmm1
+; WIN32-NEXT:subps %xmm1, %xmm0
+; WIN32-NEXT:movups 8(%ebp), %xmm1
+; WIN32-NEXT:addps %xmm1, %xmm0
+; WIN32-NEXT:movaps %xmm2, %xmm4
+; WIN32-NEXT:addps %xmm6, %xmm4
+; WIN32-NEXT:mulps %xmm6, %xmm2
+; WIN32-NEXT:subps %xmm2, %xmm4
+; WIN32-NEXT:movups 24(%ebp), %xmm1
+; WIN32-NEXT:addps %xmm1, %xmm4
+; WIN32-NEXT:movaps %xmm3, %xmm2
+; WIN32-NEXT:addps %xmm7, %xmm2
+; WIN32-NEXT:mulps %xmm7, %xmm3
+; WIN32-NEXT:subps %xmm3, %xmm2
+; WIN32-NEXT:movups 40(%ebp), %xmm1
+; WIN32-NEXT:addps %xmm1, %xmm2
+; WIN32-NEXT:movaps %xmm5, %xmm3
+; WIN32-NEXT:movaps (%esp), %xmm1 # 16-byte Reload
+; WIN32-NEXT:addps %xmm1, %xmm3
+; WIN32-NEXT:mulps %xmm1, %xmm5
+; WIN32-NEXT:subps %xmm5, %xmm3
+; WIN32-NEXT:movups 56(%ebp), %xmm1
+; WIN32-NEXT:addps %xmm1, %xmm3
+; WIN32-NEXT:movaps %xmm4, %xmm1
+; WIN32-NEXT:movl %ebp, %esp
+; WIN32-NEXT:popl %ebp
+; WIN32-NEXT:retl
+;
+; WIN64-LABEL: testf32_inp:
+; WIN64:   # %bb.0:
+; WIN64-NEXT:subq $72, %rsp
+; 

[PATCH] D155863: [X86][Regcall] Add an option to respect regcall ABI v.4 in win64

2023-08-01 Thread Bing Yu via Phabricator via cfe-commits
yubing updated this revision to Diff 545927.
yubing added a comment.

update testcase due to w=>x


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D155863/new/

https://reviews.llvm.org/D155863

Files:
  clang/include/clang/Basic/LangOptions.def
  clang/include/clang/Driver/Options.td
  clang/lib/AST/ItaniumMangle.cpp
  clang/lib/AST/Mangle.cpp
  clang/lib/AST/MicrosoftMangle.cpp
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/lib/Driver/ToolChains/Clang.cpp
  clang/test/CodeGen/check-regcall4-moduleflag.c
  clang/test/CodeGen/regcall4.c
  clang/test/CodeGenCXX/regcall4.cpp
  clang/test/Driver/cl-cc-flags.c
  llvm/lib/Target/X86/X86CallingConv.td
  llvm/test/CodeGen/X86/sse-regcall4.ll

Index: llvm/test/CodeGen/X86/sse-regcall4.ll
===
--- /dev/null
+++ llvm/test/CodeGen/X86/sse-regcall4.ll
@@ -0,0 +1,467 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-pc-win32 -mattr=+sse | FileCheck --check-prefix=WIN32 %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse | FileCheck --check-prefix=WIN64 %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+sse | FileCheck --check-prefix=LINUXOSX %s
+
+; Test regcall when receiving/returning i1
+define x86_regcallcc i1 @test_argReti1(i1 %a)  {
+; WIN32-LABEL: test_argReti1:
+; WIN32:   # %bb.0:
+; WIN32-NEXT:incb %cl
+; WIN32-NEXT:# kill: def $cl killed $cl killed $ecx
+; WIN32-NEXT:retl
+;
+; WIN64-LABEL: test_argReti1:
+; WIN64:   # %bb.0:
+; WIN64-NEXT:incb %al
+; WIN64-NEXT:# kill: def $al killed $al killed $eax
+; WIN64-NEXT:retq
+;
+; LINUXOSX-LABEL: test_argReti1:
+; LINUXOSX:   # %bb.0:
+; LINUXOSX-NEXT:incb %al
+; LINUXOSX-NEXT:# kill: def $al killed $al killed $eax
+; LINUXOSX-NEXT:retq
+  %add = add i1 %a, 1
+  ret i1 %add
+}
+
+; Test regcall when passing/retrieving i1
+define x86_regcallcc i1 @test_CallargReti1(i1 %a)  {
+; WIN32-LABEL: test_CallargReti1:
+; WIN32:   # %bb.0:
+; WIN32-NEXT:incb %cl
+; WIN32-NEXT:movzbl %cl, %ecx
+; WIN32-NEXT:calll _test_argReti1
+; WIN32-NEXT:incb %cl
+; WIN32-NEXT:retl
+;
+; WIN64-LABEL: test_CallargReti1:
+; WIN64:   # %bb.0:
+; WIN64-NEXT:pushq %rax
+; WIN64-NEXT:.seh_stackalloc 8
+; WIN64-NEXT:.seh_endprologue
+; WIN64-NEXT:incb %al
+; WIN64-NEXT:movzbl %al, %eax
+; WIN64-NEXT:callq test_argReti1
+; WIN64-NEXT:incb %al
+; WIN64-NEXT:popq %rcx
+; WIN64-NEXT:retq
+; WIN64-NEXT:.seh_endproc
+;
+; LINUXOSX-LABEL: test_CallargReti1:
+; LINUXOSX:   # %bb.0:
+; LINUXOSX-NEXT:pushq %rax
+; LINUXOSX-NEXT:.cfi_def_cfa_offset 16
+; LINUXOSX-NEXT:incb %al
+; LINUXOSX-NEXT:movzbl %al, %eax
+; LINUXOSX-NEXT:callq *test_argReti1@GOTPCREL(%rip)
+; LINUXOSX-NEXT:incb %al
+; LINUXOSX-NEXT:popq %rcx
+; LINUXOSX-NEXT:.cfi_def_cfa_offset 8
+; LINUXOSX-NEXT:retq
+  %b = add i1 %a, 1
+  %c = call x86_regcallcc i1 @test_argReti1(i1 %b)
+  %d = add i1 %c, 1
+  ret i1 %d
+}
+
+;test calling conventions - input parameters, callee saved xmms
+define x86_regcallcc <16 x float> @testf32_inp(<16 x float> %a, <16 x float> %b, <16 x float> %c) nounwind {
+; WIN32-LABEL: testf32_inp:
+; WIN32:   # %bb.0:
+; WIN32-NEXT:pushl %ebp
+; WIN32-NEXT:movl %esp, %ebp
+; WIN32-NEXT:andl $-16, %esp
+; WIN32-NEXT:subl $32, %esp
+; WIN32-NEXT:movaps %xmm7, (%esp) # 16-byte Spill
+; WIN32-NEXT:movaps %xmm6, %xmm7
+; WIN32-NEXT:movaps %xmm5, %xmm6
+; WIN32-NEXT:movaps %xmm3, %xmm5
+; WIN32-NEXT:movaps %xmm2, %xmm3
+; WIN32-NEXT:movaps %xmm1, %xmm2
+; WIN32-NEXT:movaps %xmm0, %xmm1
+; WIN32-NEXT:addps %xmm4, %xmm0
+; WIN32-NEXT:mulps %xmm4, %xmm1
+; WIN32-NEXT:subps %xmm1, %xmm0
+; WIN32-NEXT:movups 8(%ebp), %xmm1
+; WIN32-NEXT:addps %xmm1, %xmm0
+; WIN32-NEXT:movaps %xmm2, %xmm4
+; WIN32-NEXT:addps %xmm6, %xmm4
+; WIN32-NEXT:mulps %xmm6, %xmm2
+; WIN32-NEXT:subps %xmm2, %xmm4
+; WIN32-NEXT:movups 24(%ebp), %xmm1
+; WIN32-NEXT:addps %xmm1, %xmm4
+; WIN32-NEXT:movaps %xmm3, %xmm2
+; WIN32-NEXT:addps %xmm7, %xmm2
+; WIN32-NEXT:mulps %xmm7, %xmm3
+; WIN32-NEXT:subps %xmm3, %xmm2
+; WIN32-NEXT:movups 40(%ebp), %xmm1
+; WIN32-NEXT:addps %xmm1, %xmm2
+; WIN32-NEXT:movaps %xmm5, %xmm3
+; WIN32-NEXT:movaps (%esp), %xmm1 # 16-byte Reload
+; WIN32-NEXT:addps %xmm1, %xmm3
+; WIN32-NEXT:mulps %xmm1, %xmm5
+; WIN32-NEXT:subps %xmm5, %xmm3
+; WIN32-NEXT:movups 56(%ebp), %xmm1
+; WIN32-NEXT:addps %xmm1, %xmm3
+; WIN32-NEXT:movaps %xmm4, %xmm1
+; WIN32-NEXT:movl %ebp, %esp
+; WIN32-NEXT:popl %ebp
+; WIN32-NEXT:retl
+;
+; WIN64-LABEL: testf32_inp:
+; WIN64:   # %bb.0:
+; WIN64-NEXT:subq $72, %rsp
+; WIN64-NEXT:movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; WIN64-NEXT:movaps 

[PATCH] D155863: [X86][Regcall] Add an option to respect regcall ABI v.4 in win64

2023-08-01 Thread Bing Yu via Phabricator via cfe-commits
yubing updated this revision to Diff 545924.
yubing added a comment.

address c++ windows's mangling prefix w=>x


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D155863/new/

https://reviews.llvm.org/D155863

Files:
  clang/include/clang/Basic/LangOptions.def
  clang/include/clang/Driver/Options.td
  clang/lib/AST/ItaniumMangle.cpp
  clang/lib/AST/Mangle.cpp
  clang/lib/AST/MicrosoftMangle.cpp
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/lib/Driver/ToolChains/Clang.cpp
  clang/test/CodeGen/check-regcall4-moduleflag.c
  clang/test/CodeGen/regcall4.c
  clang/test/CodeGenCXX/regcall4.cpp
  clang/test/Driver/cl-cc-flags.c
  llvm/lib/Target/X86/X86CallingConv.td
  llvm/test/CodeGen/X86/sse-regcall4.ll

Index: llvm/test/CodeGen/X86/sse-regcall4.ll
===
--- /dev/null
+++ llvm/test/CodeGen/X86/sse-regcall4.ll
@@ -0,0 +1,467 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-pc-win32 -mattr=+sse | FileCheck --check-prefix=WIN32 %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse | FileCheck --check-prefix=WIN64 %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+sse | FileCheck --check-prefix=LINUXOSX %s
+
+; Test regcall when receiving/returning i1
+define x86_regcallcc i1 @test_argReti1(i1 %a)  {
+; WIN32-LABEL: test_argReti1:
+; WIN32:   # %bb.0:
+; WIN32-NEXT:incb %cl
+; WIN32-NEXT:# kill: def $cl killed $cl killed $ecx
+; WIN32-NEXT:retl
+;
+; WIN64-LABEL: test_argReti1:
+; WIN64:   # %bb.0:
+; WIN64-NEXT:incb %al
+; WIN64-NEXT:# kill: def $al killed $al killed $eax
+; WIN64-NEXT:retq
+;
+; LINUXOSX-LABEL: test_argReti1:
+; LINUXOSX:   # %bb.0:
+; LINUXOSX-NEXT:incb %al
+; LINUXOSX-NEXT:# kill: def $al killed $al killed $eax
+; LINUXOSX-NEXT:retq
+  %add = add i1 %a, 1
+  ret i1 %add
+}
+
+; Test regcall when passing/retrieving i1
+define x86_regcallcc i1 @test_CallargReti1(i1 %a)  {
+; WIN32-LABEL: test_CallargReti1:
+; WIN32:   # %bb.0:
+; WIN32-NEXT:incb %cl
+; WIN32-NEXT:movzbl %cl, %ecx
+; WIN32-NEXT:calll _test_argReti1
+; WIN32-NEXT:incb %cl
+; WIN32-NEXT:retl
+;
+; WIN64-LABEL: test_CallargReti1:
+; WIN64:   # %bb.0:
+; WIN64-NEXT:pushq %rax
+; WIN64-NEXT:.seh_stackalloc 8
+; WIN64-NEXT:.seh_endprologue
+; WIN64-NEXT:incb %al
+; WIN64-NEXT:movzbl %al, %eax
+; WIN64-NEXT:callq test_argReti1
+; WIN64-NEXT:incb %al
+; WIN64-NEXT:popq %rcx
+; WIN64-NEXT:retq
+; WIN64-NEXT:.seh_endproc
+;
+; LINUXOSX-LABEL: test_CallargReti1:
+; LINUXOSX:   # %bb.0:
+; LINUXOSX-NEXT:pushq %rax
+; LINUXOSX-NEXT:.cfi_def_cfa_offset 16
+; LINUXOSX-NEXT:incb %al
+; LINUXOSX-NEXT:movzbl %al, %eax
+; LINUXOSX-NEXT:callq *test_argReti1@GOTPCREL(%rip)
+; LINUXOSX-NEXT:incb %al
+; LINUXOSX-NEXT:popq %rcx
+; LINUXOSX-NEXT:.cfi_def_cfa_offset 8
+; LINUXOSX-NEXT:retq
+  %b = add i1 %a, 1
+  %c = call x86_regcallcc i1 @test_argReti1(i1 %b)
+  %d = add i1 %c, 1
+  ret i1 %d
+}
+
+;test calling conventions - input parameters, callee saved xmms
+define x86_regcallcc <16 x float> @testf32_inp(<16 x float> %a, <16 x float> %b, <16 x float> %c) nounwind {
+; WIN32-LABEL: testf32_inp:
+; WIN32:   # %bb.0:
+; WIN32-NEXT:pushl %ebp
+; WIN32-NEXT:movl %esp, %ebp
+; WIN32-NEXT:andl $-16, %esp
+; WIN32-NEXT:subl $32, %esp
+; WIN32-NEXT:movaps %xmm7, (%esp) # 16-byte Spill
+; WIN32-NEXT:movaps %xmm6, %xmm7
+; WIN32-NEXT:movaps %xmm5, %xmm6
+; WIN32-NEXT:movaps %xmm3, %xmm5
+; WIN32-NEXT:movaps %xmm2, %xmm3
+; WIN32-NEXT:movaps %xmm1, %xmm2
+; WIN32-NEXT:movaps %xmm0, %xmm1
+; WIN32-NEXT:addps %xmm4, %xmm0
+; WIN32-NEXT:mulps %xmm4, %xmm1
+; WIN32-NEXT:subps %xmm1, %xmm0
+; WIN32-NEXT:movups 8(%ebp), %xmm1
+; WIN32-NEXT:addps %xmm1, %xmm0
+; WIN32-NEXT:movaps %xmm2, %xmm4
+; WIN32-NEXT:addps %xmm6, %xmm4
+; WIN32-NEXT:mulps %xmm6, %xmm2
+; WIN32-NEXT:subps %xmm2, %xmm4
+; WIN32-NEXT:movups 24(%ebp), %xmm1
+; WIN32-NEXT:addps %xmm1, %xmm4
+; WIN32-NEXT:movaps %xmm3, %xmm2
+; WIN32-NEXT:addps %xmm7, %xmm2
+; WIN32-NEXT:mulps %xmm7, %xmm3
+; WIN32-NEXT:subps %xmm3, %xmm2
+; WIN32-NEXT:movups 40(%ebp), %xmm1
+; WIN32-NEXT:addps %xmm1, %xmm2
+; WIN32-NEXT:movaps %xmm5, %xmm3
+; WIN32-NEXT:movaps (%esp), %xmm1 # 16-byte Reload
+; WIN32-NEXT:addps %xmm1, %xmm3
+; WIN32-NEXT:mulps %xmm1, %xmm5
+; WIN32-NEXT:subps %xmm5, %xmm3
+; WIN32-NEXT:movups 56(%ebp), %xmm1
+; WIN32-NEXT:addps %xmm1, %xmm3
+; WIN32-NEXT:movaps %xmm4, %xmm1
+; WIN32-NEXT:movl %ebp, %esp
+; WIN32-NEXT:popl %ebp
+; WIN32-NEXT:retl
+;
+; WIN64-LABEL: testf32_inp:
+; WIN64:   # %bb.0:
+; WIN64-NEXT:subq $72, %rsp
+; WIN64-NEXT:movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; WIN64-NEXT: 

[PATCH] D155863: [X86][Regcall] Add an option to respect regcall ABI v.4 in win64

2023-07-24 Thread Bing Yu via Phabricator via cfe-commits
yubing added inline comments.



Comment at: llvm/lib/Target/X86/X86CallingConv.td:98-103
+def RC_X86_64_RegCallv4_Win : RC_X86_64_RegCall {
+  let GPR_8 = [AL, CL, DL, DIL, SIL, R8B, R9B, R11B, R12B, R14B, R15B];
+  let GPR_16 = [AX, CX, DX, DI, SI, R8W, R9W, R11W, R12W, R14W, R15W];
+  let GPR_32 = [EAX, ECX, EDX, EDI, ESI, R8D, R9D, R11D, R12D, R14D, R15D];
+  let GPR_64 = [RAX, RCX, RDX, RDI, RSI, R8, R9, R11, R12, R14, R15];
+}

yubing wrote:
> pengfei wrote:
> > According to the spec, Win64 calling convention is identical to Linux64 on 
> > V4, i.e., both `R10` and `R11` are reserved. I think you can reuse 
> > `RC_X86_64_RegCall_SysV` instead.
> i think spec says the following GP64 are passing/returning value
> RAX, RCX, RDX, RDI, RSI, R8, R9, R10, R11, R12, R14, R15
> REF: 
> https://www.intel.com/content/www/us/en/docs/cpp-compiler/developer-guide-reference/2021-8/c-c-calling-conventions.html#GUID-011A435D-F8D0-46D7-B973-9B704CA5B54E
so R10 and R13 are reserved instead of R10 and R11


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D155863/new/

https://reviews.llvm.org/D155863

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D155863: [X86][Regcall] Add an option to respect regcall ABI v.4 in win64

2023-07-24 Thread Bing Yu via Phabricator via cfe-commits
yubing added inline comments.



Comment at: llvm/lib/Target/X86/X86CallingConv.td:98-103
+def RC_X86_64_RegCallv4_Win : RC_X86_64_RegCall {
+  let GPR_8 = [AL, CL, DL, DIL, SIL, R8B, R9B, R11B, R12B, R14B, R15B];
+  let GPR_16 = [AX, CX, DX, DI, SI, R8W, R9W, R11W, R12W, R14W, R15W];
+  let GPR_32 = [EAX, ECX, EDX, EDI, ESI, R8D, R9D, R11D, R12D, R14D, R15D];
+  let GPR_64 = [RAX, RCX, RDX, RDI, RSI, R8, R9, R11, R12, R14, R15];
+}

pengfei wrote:
> According to the spec, Win64 calling convention is identical to Linux64 on 
> V4, i.e., both `R10` and `R11` are reserved. I think you can reuse 
> `RC_X86_64_RegCall_SysV` instead.
i think spec says the following GP64 are passing/returning value
RAX, RCX, RDX, RDI, RSI, R8, R9, R10, R11, R12, R14, R15
REF: 
https://www.intel.com/content/www/us/en/docs/cpp-compiler/developer-guide-reference/2021-8/c-c-calling-conventions.html#GUID-011A435D-F8D0-46D7-B973-9B704CA5B54E


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D155863/new/

https://reviews.llvm.org/D155863

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D155863: [X86][Regcall] Add an option to respect regcall ABI v.4 in win64

2023-07-24 Thread Bing Yu via Phabricator via cfe-commits
yubing updated this revision to Diff 543437.
yubing added a comment.

small fix


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D155863/new/

https://reviews.llvm.org/D155863

Files:
  clang/include/clang/Basic/LangOptions.def
  clang/include/clang/Driver/Options.td
  clang/lib/AST/ItaniumMangle.cpp
  clang/lib/AST/Mangle.cpp
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/lib/Driver/ToolChains/Clang.cpp
  clang/test/CodeGen/check-regcall4-moduleflag.c
  clang/test/CodeGen/regcall4.c
  clang/test/CodeGenCXX/regcall4.cpp
  clang/test/Driver/cl-cc-flags.c
  llvm/lib/Target/X86/X86CallingConv.td
  llvm/test/CodeGen/X86/sse-regcall4.ll

Index: llvm/test/CodeGen/X86/sse-regcall4.ll
===
--- /dev/null
+++ llvm/test/CodeGen/X86/sse-regcall4.ll
@@ -0,0 +1,467 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-pc-win32 -mattr=+sse | FileCheck --check-prefix=WIN32 %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse | FileCheck --check-prefix=WIN64 %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+sse | FileCheck --check-prefix=LINUXOSX %s
+
+; Test regcall when receiving/returning i1
+define x86_regcallcc i1 @test_argReti1(i1 %a)  {
+; WIN32-LABEL: test_argReti1:
+; WIN32:   # %bb.0:
+; WIN32-NEXT:incb %cl
+; WIN32-NEXT:# kill: def $cl killed $cl killed $ecx
+; WIN32-NEXT:retl
+;
+; WIN64-LABEL: test_argReti1:
+; WIN64:   # %bb.0:
+; WIN64-NEXT:incb %al
+; WIN64-NEXT:# kill: def $al killed $al killed $eax
+; WIN64-NEXT:retq
+;
+; LINUXOSX-LABEL: test_argReti1:
+; LINUXOSX:   # %bb.0:
+; LINUXOSX-NEXT:incb %al
+; LINUXOSX-NEXT:# kill: def $al killed $al killed $eax
+; LINUXOSX-NEXT:retq
+  %add = add i1 %a, 1
+  ret i1 %add
+}
+
+; Test regcall when passing/retrieving i1
+define x86_regcallcc i1 @test_CallargReti1(i1 %a)  {
+; WIN32-LABEL: test_CallargReti1:
+; WIN32:   # %bb.0:
+; WIN32-NEXT:incb %cl
+; WIN32-NEXT:movzbl %cl, %ecx
+; WIN32-NEXT:calll _test_argReti1
+; WIN32-NEXT:incb %cl
+; WIN32-NEXT:retl
+;
+; WIN64-LABEL: test_CallargReti1:
+; WIN64:   # %bb.0:
+; WIN64-NEXT:pushq %rax
+; WIN64-NEXT:.seh_stackalloc 8
+; WIN64-NEXT:.seh_endprologue
+; WIN64-NEXT:incb %al
+; WIN64-NEXT:movzbl %al, %eax
+; WIN64-NEXT:callq test_argReti1
+; WIN64-NEXT:incb %al
+; WIN64-NEXT:popq %rcx
+; WIN64-NEXT:retq
+; WIN64-NEXT:.seh_endproc
+;
+; LINUXOSX-LABEL: test_CallargReti1:
+; LINUXOSX:   # %bb.0:
+; LINUXOSX-NEXT:pushq %rax
+; LINUXOSX-NEXT:.cfi_def_cfa_offset 16
+; LINUXOSX-NEXT:incb %al
+; LINUXOSX-NEXT:movzbl %al, %eax
+; LINUXOSX-NEXT:callq *test_argReti1@GOTPCREL(%rip)
+; LINUXOSX-NEXT:incb %al
+; LINUXOSX-NEXT:popq %rcx
+; LINUXOSX-NEXT:.cfi_def_cfa_offset 8
+; LINUXOSX-NEXT:retq
+  %b = add i1 %a, 1
+  %c = call x86_regcallcc i1 @test_argReti1(i1 %b)
+  %d = add i1 %c, 1
+  ret i1 %d
+}
+
+;test calling conventions - input parameters, callee saved xmms
+define x86_regcallcc <16 x float> @testf32_inp(<16 x float> %a, <16 x float> %b, <16 x float> %c) nounwind {
+; WIN32-LABEL: testf32_inp:
+; WIN32:   # %bb.0:
+; WIN32-NEXT:pushl %ebp
+; WIN32-NEXT:movl %esp, %ebp
+; WIN32-NEXT:andl $-16, %esp
+; WIN32-NEXT:subl $32, %esp
+; WIN32-NEXT:movaps %xmm7, (%esp) # 16-byte Spill
+; WIN32-NEXT:movaps %xmm6, %xmm7
+; WIN32-NEXT:movaps %xmm5, %xmm6
+; WIN32-NEXT:movaps %xmm3, %xmm5
+; WIN32-NEXT:movaps %xmm2, %xmm3
+; WIN32-NEXT:movaps %xmm1, %xmm2
+; WIN32-NEXT:movaps %xmm0, %xmm1
+; WIN32-NEXT:addps %xmm4, %xmm0
+; WIN32-NEXT:mulps %xmm4, %xmm1
+; WIN32-NEXT:subps %xmm1, %xmm0
+; WIN32-NEXT:movups 8(%ebp), %xmm1
+; WIN32-NEXT:addps %xmm1, %xmm0
+; WIN32-NEXT:movaps %xmm2, %xmm4
+; WIN32-NEXT:addps %xmm6, %xmm4
+; WIN32-NEXT:mulps %xmm6, %xmm2
+; WIN32-NEXT:subps %xmm2, %xmm4
+; WIN32-NEXT:movups 24(%ebp), %xmm1
+; WIN32-NEXT:addps %xmm1, %xmm4
+; WIN32-NEXT:movaps %xmm3, %xmm2
+; WIN32-NEXT:addps %xmm7, %xmm2
+; WIN32-NEXT:mulps %xmm7, %xmm3
+; WIN32-NEXT:subps %xmm3, %xmm2
+; WIN32-NEXT:movups 40(%ebp), %xmm1
+; WIN32-NEXT:addps %xmm1, %xmm2
+; WIN32-NEXT:movaps %xmm5, %xmm3
+; WIN32-NEXT:movaps (%esp), %xmm1 # 16-byte Reload
+; WIN32-NEXT:addps %xmm1, %xmm3
+; WIN32-NEXT:mulps %xmm1, %xmm5
+; WIN32-NEXT:subps %xmm5, %xmm3
+; WIN32-NEXT:movups 56(%ebp), %xmm1
+; WIN32-NEXT:addps %xmm1, %xmm3
+; WIN32-NEXT:movaps %xmm4, %xmm1
+; WIN32-NEXT:movl %ebp, %esp
+; WIN32-NEXT:popl %ebp
+; WIN32-NEXT:retl
+;
+; WIN64-LABEL: testf32_inp:
+; WIN64:   # %bb.0:
+; WIN64-NEXT:subq $72, %rsp
+; WIN64-NEXT:movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; WIN64-NEXT:movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; 

[PATCH] D155863: [X86][Regcall] Add an option to respect regcall ABI v.4 in win64

2023-07-24 Thread Bing Yu via Phabricator via cfe-commits
yubing added inline comments.



Comment at: llvm/lib/Target/X86/X86CallingConv.td:468
+defm X86_32_RegCallv4_Win :
+X86_RegCall_base;
 defm X86_Win64_RegCall :

pengfei wrote:
> This will define RetCC_* as well but it is not used, hence will emit warning. 
> Any way to solve it?
Yes, in fact RetCC should respect regcall ABI v.4 as well, according to:
https://www.intel.com/content/www/us/en/docs/cpp-compiler/developer-guide-reference/2021-8/c-c-calling-conventions.html#GUID-011A435D-F8D0-46D7-B973-9B704CA5B54E


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D155863/new/

https://reviews.llvm.org/D155863

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D155863: [X86][Regcall] Add an option to respect regcall ABI v.4 in win64

2023-07-24 Thread Bing Yu via Phabricator via cfe-commits
yubing updated this revision to Diff 543433.
yubing added a comment.

make retcc respect regcall ABI v.4 as well


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D155863/new/

https://reviews.llvm.org/D155863

Files:
  clang/include/clang/Basic/LangOptions.def
  clang/include/clang/Driver/Options.td
  clang/lib/AST/ItaniumMangle.cpp
  clang/lib/AST/Mangle.cpp
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/lib/Driver/ToolChains/Clang.cpp
  clang/test/CodeGen/check-regcall4-moduleflag.c
  clang/test/CodeGen/regcall4.c
  clang/test/CodeGenCXX/regcall4.cpp
  clang/test/Driver/cl-cc-flags.c
  llvm/lib/Target/X86/X86CallingConv.td
  llvm/test/CodeGen/X86/sse-regcall4.ll

Index: llvm/test/CodeGen/X86/sse-regcall4.ll
===
--- /dev/null
+++ llvm/test/CodeGen/X86/sse-regcall4.ll
@@ -0,0 +1,467 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-pc-win32 -mattr=+sse | FileCheck --check-prefix=WIN32 %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse | FileCheck --check-prefix=WIN64 %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+sse | FileCheck --check-prefix=LINUXOSX %s
+
+; Test regcall when receiving/returning i1
+define x86_regcallcc i1 @test_argReti1(i1 %a)  {
+; WIN32-LABEL: test_argReti1:
+; WIN32:   # %bb.0:
+; WIN32-NEXT:incb %cl
+; WIN32-NEXT:# kill: def $cl killed $cl killed $ecx
+; WIN32-NEXT:retl
+;
+; WIN64-LABEL: test_argReti1:
+; WIN64:   # %bb.0:
+; WIN64-NEXT:incb %al
+; WIN64-NEXT:# kill: def $al killed $al killed $eax
+; WIN64-NEXT:retq
+;
+; LINUXOSX-LABEL: test_argReti1:
+; LINUXOSX:   # %bb.0:
+; LINUXOSX-NEXT:incb %al
+; LINUXOSX-NEXT:# kill: def $al killed $al killed $eax
+; LINUXOSX-NEXT:retq
+  %add = add i1 %a, 1
+  ret i1 %add
+}
+
+; Test regcall when passing/retrieving i1
+define x86_regcallcc i1 @test_CallargReti1(i1 %a)  {
+; WIN32-LABEL: test_CallargReti1:
+; WIN32:   # %bb.0:
+; WIN32-NEXT:incb %cl
+; WIN32-NEXT:movzbl %cl, %ecx
+; WIN32-NEXT:calll _test_argReti1
+; WIN32-NEXT:incb %cl
+; WIN32-NEXT:retl
+;
+; WIN64-LABEL: test_CallargReti1:
+; WIN64:   # %bb.0:
+; WIN64-NEXT:pushq %rax
+; WIN64-NEXT:.seh_stackalloc 8
+; WIN64-NEXT:.seh_endprologue
+; WIN64-NEXT:incb %al
+; WIN64-NEXT:movzbl %al, %eax
+; WIN64-NEXT:callq test_argReti1
+; WIN64-NEXT:incb %al
+; WIN64-NEXT:popq %rcx
+; WIN64-NEXT:retq
+; WIN64-NEXT:.seh_endproc
+;
+; LINUXOSX-LABEL: test_CallargReti1:
+; LINUXOSX:   # %bb.0:
+; LINUXOSX-NEXT:pushq %rax
+; LINUXOSX-NEXT:.cfi_def_cfa_offset 16
+; LINUXOSX-NEXT:incb %al
+; LINUXOSX-NEXT:movzbl %al, %eax
+; LINUXOSX-NEXT:callq *test_argReti1@GOTPCREL(%rip)
+; LINUXOSX-NEXT:incb %al
+; LINUXOSX-NEXT:popq %rcx
+; LINUXOSX-NEXT:.cfi_def_cfa_offset 8
+; LINUXOSX-NEXT:retq
+  %b = add i1 %a, 1
+  %c = call x86_regcallcc i1 @test_argReti1(i1 %b)
+  %d = add i1 %c, 1
+  ret i1 %d
+}
+
+;test calling conventions - input parameters, callee saved xmms
+define x86_regcallcc <16 x float> @testf32_inp(<16 x float> %a, <16 x float> %b, <16 x float> %c) nounwind {
+; WIN32-LABEL: testf32_inp:
+; WIN32:   # %bb.0:
+; WIN32-NEXT:pushl %ebp
+; WIN32-NEXT:movl %esp, %ebp
+; WIN32-NEXT:andl $-16, %esp
+; WIN32-NEXT:subl $32, %esp
+; WIN32-NEXT:movaps %xmm7, (%esp) # 16-byte Spill
+; WIN32-NEXT:movaps %xmm6, %xmm7
+; WIN32-NEXT:movaps %xmm5, %xmm6
+; WIN32-NEXT:movaps %xmm3, %xmm5
+; WIN32-NEXT:movaps %xmm2, %xmm3
+; WIN32-NEXT:movaps %xmm1, %xmm2
+; WIN32-NEXT:movaps %xmm0, %xmm1
+; WIN32-NEXT:addps %xmm4, %xmm0
+; WIN32-NEXT:mulps %xmm4, %xmm1
+; WIN32-NEXT:subps %xmm1, %xmm0
+; WIN32-NEXT:movups 8(%ebp), %xmm1
+; WIN32-NEXT:addps %xmm1, %xmm0
+; WIN32-NEXT:movaps %xmm2, %xmm4
+; WIN32-NEXT:addps %xmm6, %xmm4
+; WIN32-NEXT:mulps %xmm6, %xmm2
+; WIN32-NEXT:subps %xmm2, %xmm4
+; WIN32-NEXT:movups 24(%ebp), %xmm1
+; WIN32-NEXT:addps %xmm1, %xmm4
+; WIN32-NEXT:movaps %xmm3, %xmm2
+; WIN32-NEXT:addps %xmm7, %xmm2
+; WIN32-NEXT:mulps %xmm7, %xmm3
+; WIN32-NEXT:subps %xmm3, %xmm2
+; WIN32-NEXT:movups 40(%ebp), %xmm1
+; WIN32-NEXT:addps %xmm1, %xmm2
+; WIN32-NEXT:movaps %xmm5, %xmm3
+; WIN32-NEXT:movaps (%esp), %xmm1 # 16-byte Reload
+; WIN32-NEXT:addps %xmm1, %xmm3
+; WIN32-NEXT:mulps %xmm1, %xmm5
+; WIN32-NEXT:subps %xmm5, %xmm3
+; WIN32-NEXT:movups 56(%ebp), %xmm1
+; WIN32-NEXT:addps %xmm1, %xmm3
+; WIN32-NEXT:movaps %xmm4, %xmm1
+; WIN32-NEXT:movl %ebp, %esp
+; WIN32-NEXT:popl %ebp
+; WIN32-NEXT:retl
+;
+; WIN64-LABEL: testf32_inp:
+; WIN64:   # %bb.0:
+; WIN64-NEXT:subq $72, %rsp
+; WIN64-NEXT:movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; WIN64-NEXT:movaps %xmm14, 

[PATCH] D155863: [X86][Regcall] Add an option to respect regcall ABI v.4 in win64

2023-07-20 Thread Bing Yu via Phabricator via cfe-commits
yubing added a comment.

ping?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D155863/new/

https://reviews.llvm.org/D155863

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D155863: [X86][Regcall] Add an option to respect regcall ABI v.4 in win64

2023-07-20 Thread Bing Yu via Phabricator via cfe-commits
yubing updated this revision to Diff 542575.
yubing added a comment.

small fix


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D155863/new/

https://reviews.llvm.org/D155863

Files:
  clang/include/clang/Basic/LangOptions.def
  clang/include/clang/Driver/Options.td
  clang/lib/AST/ItaniumMangle.cpp
  clang/lib/AST/Mangle.cpp
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/lib/Driver/ToolChains/Clang.cpp
  clang/test/CodeGen/check-regcall4-moduleflag.c
  clang/test/CodeGen/regcall4.c
  clang/test/CodeGenCXX/regcall4.cpp
  clang/test/Driver/cl-cc-flags.c
  llvm/lib/Target/X86/X86CallingConv.td
  llvm/test/CodeGen/X86/sse-regcall4.ll

Index: llvm/test/CodeGen/X86/sse-regcall4.ll
===
--- /dev/null
+++ llvm/test/CodeGen/X86/sse-regcall4.ll
@@ -0,0 +1,462 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-pc-win32 -mattr=+sse | FileCheck --check-prefix=WIN32 %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse | FileCheck --check-prefix=WIN64 %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+sse | FileCheck --check-prefix=LINUXOSX %s
+
+; Test regcall when receiving/returning i1
+define x86_regcallcc i1 @test_argReti1(i1 %a)  {
+; WIN32-LABEL: test_argReti1:
+; WIN32:   # %bb.0:
+; WIN32-NEXT:movl %ecx, %eax
+; WIN32-NEXT:incb %al
+; WIN32-NEXT:# kill: def $al killed $al killed $eax
+; WIN32-NEXT:retl
+;
+; WIN64-LABEL: test_argReti1:
+; WIN64:   # %bb.0:
+; WIN64-NEXT:incb %al
+; WIN64-NEXT:# kill: def $al killed $al killed $eax
+; WIN64-NEXT:retq
+;
+; LINUXOSX-LABEL: test_argReti1:
+; LINUXOSX:   # %bb.0:
+; LINUXOSX-NEXT:incb %al
+; LINUXOSX-NEXT:# kill: def $al killed $al killed $eax
+; LINUXOSX-NEXT:retq
+  %add = add i1 %a, 1
+  ret i1 %add
+}
+
+; Test regcall when passing/retrieving i1
+define x86_regcallcc i1 @test_CallargReti1(i1 %a)  {
+; WIN32-LABEL: test_CallargReti1:
+; WIN32:   # %bb.0:
+; WIN32-NEXT:incb %cl
+; WIN32-NEXT:movzbl %cl, %ecx
+; WIN32-NEXT:calll _test_argReti1
+; WIN32-NEXT:incb %al
+; WIN32-NEXT:retl
+;
+; WIN64-LABEL: test_CallargReti1:
+; WIN64:   # %bb.0:
+; WIN64-NEXT:pushq %rax
+; WIN64-NEXT:.seh_stackalloc 8
+; WIN64-NEXT:.seh_endprologue
+; WIN64-NEXT:incb %al
+; WIN64-NEXT:movzbl %al, %eax
+; WIN64-NEXT:callq test_argReti1
+; WIN64-NEXT:incb %al
+; WIN64-NEXT:popq %rcx
+; WIN64-NEXT:retq
+; WIN64-NEXT:.seh_endproc
+;
+; LINUXOSX-LABEL: test_CallargReti1:
+; LINUXOSX:   # %bb.0:
+; LINUXOSX-NEXT:pushq %rax
+; LINUXOSX-NEXT:.cfi_def_cfa_offset 16
+; LINUXOSX-NEXT:incb %al
+; LINUXOSX-NEXT:movzbl %al, %eax
+; LINUXOSX-NEXT:callq *test_argReti1@GOTPCREL(%rip)
+; LINUXOSX-NEXT:incb %al
+; LINUXOSX-NEXT:popq %rcx
+; LINUXOSX-NEXT:.cfi_def_cfa_offset 8
+; LINUXOSX-NEXT:retq
+  %b = add i1 %a, 1
+  %c = call x86_regcallcc i1 @test_argReti1(i1 %b)
+  %d = add i1 %c, 1
+  ret i1 %d
+}
+
+;test calling conventions - input parameters, callee saved xmms
+define x86_regcallcc <16 x float> @testf32_inp(<16 x float> %a, <16 x float> %b, <16 x float> %c) nounwind {
+; WIN32-LABEL: testf32_inp:
+; WIN32:   # %bb.0:
+; WIN32-NEXT:pushl %ebp
+; WIN32-NEXT:movl %esp, %ebp
+; WIN32-NEXT:andl $-16, %esp
+; WIN32-NEXT:subl $32, %esp
+; WIN32-NEXT:movaps %xmm7, (%esp) # 16-byte Spill
+; WIN32-NEXT:movaps %xmm6, %xmm7
+; WIN32-NEXT:movaps %xmm5, %xmm6
+; WIN32-NEXT:movaps %xmm3, %xmm5
+; WIN32-NEXT:movaps %xmm2, %xmm3
+; WIN32-NEXT:movaps %xmm1, %xmm2
+; WIN32-NEXT:movaps %xmm0, %xmm1
+; WIN32-NEXT:addps %xmm4, %xmm0
+; WIN32-NEXT:mulps %xmm4, %xmm1
+; WIN32-NEXT:subps %xmm1, %xmm0
+; WIN32-NEXT:movups 8(%ebp), %xmm1
+; WIN32-NEXT:addps %xmm1, %xmm0
+; WIN32-NEXT:movaps %xmm2, %xmm4
+; WIN32-NEXT:addps %xmm6, %xmm4
+; WIN32-NEXT:mulps %xmm6, %xmm2
+; WIN32-NEXT:subps %xmm2, %xmm4
+; WIN32-NEXT:movups 24(%ebp), %xmm1
+; WIN32-NEXT:addps %xmm1, %xmm4
+; WIN32-NEXT:movaps %xmm3, %xmm2
+; WIN32-NEXT:addps %xmm7, %xmm2
+; WIN32-NEXT:mulps %xmm7, %xmm3
+; WIN32-NEXT:subps %xmm3, %xmm2
+; WIN32-NEXT:movups 40(%ebp), %xmm1
+; WIN32-NEXT:addps %xmm1, %xmm2
+; WIN32-NEXT:movaps %xmm5, %xmm3
+; WIN32-NEXT:movaps (%esp), %xmm1 # 16-byte Reload
+; WIN32-NEXT:addps %xmm1, %xmm3
+; WIN32-NEXT:mulps %xmm1, %xmm5
+; WIN32-NEXT:subps %xmm5, %xmm3
+; WIN32-NEXT:movups 56(%ebp), %xmm1
+; WIN32-NEXT:addps %xmm1, %xmm3
+; WIN32-NEXT:movaps %xmm4, %xmm1
+; WIN32-NEXT:movl %ebp, %esp
+; WIN32-NEXT:popl %ebp
+; WIN32-NEXT:retl
+;
+; WIN64-LABEL: testf32_inp:
+; WIN64:   # %bb.0:
+; WIN64-NEXT:subq $72, %rsp
+; WIN64-NEXT:movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; WIN64-NEXT:movaps %xmm14, 

[PATCH] D155863: [X86][Regcall] Add an option to respect regcall ABI v.4 in win64

2023-07-20 Thread Bing Yu via Phabricator via cfe-commits
yubing created this revision.
Herald added subscribers: pengfei, hiraditya.
Herald added a project: All.
yubing requested review of this revision.
Herald added subscribers: llvm-commits, cfe-commits, MaskRay.
Herald added projects: clang, LLVM.

Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D155863

Files:
  clang/include/clang/Basic/LangOptions.def
  clang/include/clang/Driver/Options.td
  clang/lib/AST/ItaniumMangle.cpp
  clang/lib/AST/Mangle.cpp
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/lib/Driver/ToolChains/Clang.cpp
  clang/test/CodeGen/check-regcall4-moduleflag.c
  clang/test/CodeGen/regcall4.c
  clang/test/CodeGenCXX/regcall4.cpp
  clang/test/Driver/cl-cc-flags.c
  llvm/lib/Target/X86/X86CallingConv.td
  llvm/test/CodeGen/X86/sse-regcall4.ll

Index: llvm/test/CodeGen/X86/sse-regcall4.ll
===
--- /dev/null
+++ llvm/test/CodeGen/X86/sse-regcall4.ll
@@ -0,0 +1,462 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-pc-win32 -mattr=+sse | FileCheck --check-prefix=WIN32 %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse | FileCheck --check-prefix=WIN64 %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+sse | FileCheck --check-prefix=LINUXOSX %s
+
+; Test regcall when receiving/returning i1
+define x86_regcallcc i1 @test_argReti1(i1 %a)  {
+; WIN32-LABEL: test_argReti1:
+; WIN32:   # %bb.0:
+; WIN32-NEXT:movl %ecx, %eax
+; WIN32-NEXT:incb %al
+; WIN32-NEXT:# kill: def $al killed $al killed $eax
+; WIN32-NEXT:retl
+;
+; WIN64-LABEL: test_argReti1:
+; WIN64:   # %bb.0:
+; WIN64-NEXT:incb %al
+; WIN64-NEXT:# kill: def $al killed $al killed $eax
+; WIN64-NEXT:retq
+;
+; LINUXOSX-LABEL: test_argReti1:
+; LINUXOSX:   # %bb.0:
+; LINUXOSX-NEXT:incb %al
+; LINUXOSX-NEXT:# kill: def $al killed $al killed $eax
+; LINUXOSX-NEXT:retq
+  %add = add i1 %a, 1
+  ret i1 %add
+}
+
+; Test regcall when passing/retrieving i1
+define x86_regcallcc i1 @test_CallargReti1(i1 %a)  {
+; WIN32-LABEL: test_CallargReti1:
+; WIN32:   # %bb.0:
+; WIN32-NEXT:incb %cl
+; WIN32-NEXT:movzbl %cl, %ecx
+; WIN32-NEXT:calll _test_argReti1
+; WIN32-NEXT:incb %al
+; WIN32-NEXT:retl
+;
+; WIN64-LABEL: test_CallargReti1:
+; WIN64:   # %bb.0:
+; WIN64-NEXT:pushq %rax
+; WIN64-NEXT:.seh_stackalloc 8
+; WIN64-NEXT:.seh_endprologue
+; WIN64-NEXT:incb %al
+; WIN64-NEXT:movzbl %al, %eax
+; WIN64-NEXT:callq test_argReti1
+; WIN64-NEXT:incb %al
+; WIN64-NEXT:popq %rcx
+; WIN64-NEXT:retq
+; WIN64-NEXT:.seh_endproc
+;
+; LINUXOSX-LABEL: test_CallargReti1:
+; LINUXOSX:   # %bb.0:
+; LINUXOSX-NEXT:pushq %rax
+; LINUXOSX-NEXT:.cfi_def_cfa_offset 16
+; LINUXOSX-NEXT:incb %al
+; LINUXOSX-NEXT:movzbl %al, %eax
+; LINUXOSX-NEXT:callq *test_argReti1@GOTPCREL(%rip)
+; LINUXOSX-NEXT:incb %al
+; LINUXOSX-NEXT:popq %rcx
+; LINUXOSX-NEXT:.cfi_def_cfa_offset 8
+; LINUXOSX-NEXT:retq
+  %b = add i1 %a, 1
+  %c = call x86_regcallcc i1 @test_argReti1(i1 %b)
+  %d = add i1 %c, 1
+  ret i1 %d
+}
+
+;test calling conventions - input parameters, callee saved xmms
+define x86_regcallcc <16 x float> @testf32_inp(<16 x float> %a, <16 x float> %b, <16 x float> %c) nounwind {
+; WIN32-LABEL: testf32_inp:
+; WIN32:   # %bb.0:
+; WIN32-NEXT:pushl %ebp
+; WIN32-NEXT:movl %esp, %ebp
+; WIN32-NEXT:andl $-16, %esp
+; WIN32-NEXT:subl $32, %esp
+; WIN32-NEXT:movaps %xmm7, (%esp) # 16-byte Spill
+; WIN32-NEXT:movaps %xmm6, %xmm7
+; WIN32-NEXT:movaps %xmm5, %xmm6
+; WIN32-NEXT:movaps %xmm3, %xmm5
+; WIN32-NEXT:movaps %xmm2, %xmm3
+; WIN32-NEXT:movaps %xmm1, %xmm2
+; WIN32-NEXT:movaps %xmm0, %xmm1
+; WIN32-NEXT:addps %xmm4, %xmm0
+; WIN32-NEXT:mulps %xmm4, %xmm1
+; WIN32-NEXT:subps %xmm1, %xmm0
+; WIN32-NEXT:movups 8(%ebp), %xmm1
+; WIN32-NEXT:addps %xmm1, %xmm0
+; WIN32-NEXT:movaps %xmm2, %xmm4
+; WIN32-NEXT:addps %xmm6, %xmm4
+; WIN32-NEXT:mulps %xmm6, %xmm2
+; WIN32-NEXT:subps %xmm2, %xmm4
+; WIN32-NEXT:movups 24(%ebp), %xmm1
+; WIN32-NEXT:addps %xmm1, %xmm4
+; WIN32-NEXT:movaps %xmm3, %xmm2
+; WIN32-NEXT:addps %xmm7, %xmm2
+; WIN32-NEXT:mulps %xmm7, %xmm3
+; WIN32-NEXT:subps %xmm3, %xmm2
+; WIN32-NEXT:movups 40(%ebp), %xmm1
+; WIN32-NEXT:addps %xmm1, %xmm2
+; WIN32-NEXT:movaps %xmm5, %xmm3
+; WIN32-NEXT:movaps (%esp), %xmm1 # 16-byte Reload
+; WIN32-NEXT:addps %xmm1, %xmm3
+; WIN32-NEXT:mulps %xmm1, %xmm5
+; WIN32-NEXT:subps %xmm5, %xmm3
+; WIN32-NEXT:movups 56(%ebp), %xmm1
+; WIN32-NEXT:addps %xmm1, %xmm3
+; WIN32-NEXT:movaps %xmm4, %xmm1
+; WIN32-NEXT:movl %ebp, %esp
+; WIN32-NEXT:popl %ebp
+; WIN32-NEXT:retl
+;
+; WIN64-LABEL: testf32_inp:
+; WIN64:   # %bb.0:
+; WIN64-NEXT:subq $72, %rsp
+; WIN64-NEXT:movaps 

[PATCH] D132141: [X86] Emulate _rdrand64_step with two rdrand32 if it is 32bit

2022-08-23 Thread Bing Yu via Phabricator via cfe-commits
This revision was landed with ongoing or failed builds.
This revision was automatically updated to reflect the committed changes.
Closed by commit rG6d8ddf53cc80: [X86] Emulate _rdrand64_step with two rdrand32 
if it is 32bit (authored by yubing).

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D132141/new/

https://reviews.llvm.org/D132141

Files:
  clang/lib/Headers/immintrin.h
  clang/test/CodeGen/X86/rdrand-builtins.c

Index: clang/test/CodeGen/X86/rdrand-builtins.c
===
--- clang/test/CodeGen/X86/rdrand-builtins.c
+++ clang/test/CodeGen/X86/rdrand-builtins.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +rdrnd -target-feature +rdseed -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X64
-// RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s -triple=i386-unknown-unknown -target-feature +rdrnd -target-feature +rdseed -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s -triple=i386-unknown-unknown -target-feature +rdrnd -target-feature +rdseed -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X86
 
 #include 
 
@@ -17,14 +17,61 @@
 // CHECK: store i32
 }
 
-#if __x86_64__
 int rdrand64(unsigned long long *p) {
   return _rdrand64_step(p);
 // X64: @rdrand64
 // X64: call { i64, i32 } @llvm.x86.rdrand.64
 // X64: store i64
+
+// X86-LABEL: @rdrand64(
+// X86-NEXT:  entry:
+// X86-NEXT:[[RETVAL_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[__P_ADDR_I:%.*]] = alloca i64*, align 4
+// X86-NEXT:[[__LO_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[__HI_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[__RES_LO_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[__RES_HI_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[P_ADDR:%.*]] = alloca i64*, align 4
+// X86-NEXT:store i64* [[P:%.*]], i64** [[P_ADDR]], align 4
+// X86-NEXT:[[TMP0:%.*]] = load i64*, i64** [[P_ADDR]], align 4
+// X86-NEXT:store i64* [[TMP0]], i64** [[__P_ADDR_I]], align 4
+// X86-NEXT:[[TMP1:%.*]] = call { i32, i32 } @llvm.x86.rdrand.32()
+// X86-NEXT:[[TMP2:%.*]] = extractvalue { i32, i32 } [[TMP1]], 0
+// X86-NEXT:store i32 [[TMP2]], i32* [[__LO_I]], align 4
+// X86-NEXT:[[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP1]], 1
+// X86-NEXT:store i32 [[TMP3]], i32* [[__RES_LO_I]], align 4
+// X86-NEXT:[[TMP4:%.*]] = call { i32, i32 } @llvm.x86.rdrand.32()
+// X86-NEXT:[[TMP5:%.*]] = extractvalue { i32, i32 } [[TMP4]], 0
+// X86-NEXT:store i32 [[TMP5]], i32* [[__HI_I]], align 4
+// X86-NEXT:[[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP4]], 1
+// X86-NEXT:store i32 [[TMP6]], i32* [[__RES_HI_I]], align 4
+// X86-NEXT:[[TMP7:%.*]] = load i32, i32* [[__RES_LO_I]], align 4
+// X86-NEXT:[[TOBOOL_I:%.*]] = icmp ne i32 [[TMP7]], 0
+// X86-NEXT:br i1 [[TOBOOL_I]], label [[LAND_LHS_TRUE_I:%.*]], label [[IF_ELSE_I:%.*]]
+// X86:   land.lhs.true.i:
+// X86-NEXT:[[TMP8:%.*]] = load i32, i32* [[__RES_HI_I]], align 4
+// X86-NEXT:[[TOBOOL1_I:%.*]] = icmp ne i32 [[TMP8]], 0
+// X86-NEXT:br i1 [[TOBOOL1_I]], label [[IF_THEN_I:%.*]], label [[IF_ELSE_I]]
+// X86:   if.then.i:
+// X86-NEXT:[[TMP9:%.*]] = load i32, i32* [[__HI_I]], align 4
+// X86-NEXT:[[CONV_I:%.*]] = zext i32 [[TMP9]] to i64
+// X86-NEXT:[[SHL_I:%.*]] = shl i64 [[CONV_I]], 32
+// X86-NEXT:[[TMP10:%.*]] = load i32, i32* [[__LO_I]], align 4
+// X86-NEXT:[[CONV2_I:%.*]] = zext i32 [[TMP10]] to i64
+// X86-NEXT:[[OR_I:%.*]] = or i64 [[SHL_I]], [[CONV2_I]]
+// X86-NEXT:[[TMP11:%.*]] = load i64*, i64** [[__P_ADDR_I]], align 4
+// X86-NEXT:store i64 [[OR_I]], i64* [[TMP11]], align 4
+// X86-NEXT:store i32 1, i32* [[RETVAL_I]], align 4
+// X86-NEXT:br label [[_RDRAND64_STEP_EXIT:%.*]]
+// X86:   if.else.i:
+// X86-NEXT:[[TMP12:%.*]] = load i64*, i64** [[__P_ADDR_I]], align 4
+// X86-NEXT:store i64 0, i64* [[TMP12]], align 4
+// X86-NEXT:store i32 0, i32* [[RETVAL_I]], align 4
+// X86-NEXT:br label [[_RDRAND64_STEP_EXIT]]
+// X86:   _rdrand64_step.exit:
+// X86-NEXT:[[TMP13:%.*]] = load i32, i32* [[RETVAL_I]], align 4
+// X86-NEXT:ret i32 [[TMP13]]
 }
-#endif
 
 int rdseed16(unsigned short *p) {
   return _rdseed16_step(p);
Index: clang/lib/Headers/immintrin.h
===
--- clang/lib/Headers/immintrin.h
+++ clang/lib/Headers/immintrin.h
@@ -287,6 +287,23 @@
 {
   return (int)__builtin_ia32_rdrand64_step(__p);
 }
+#else
+// We need to emulate the functionality of 64-bit rdrand with 2 32-bit
+// rdrand instructions.
+static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
+_rdrand64_step(unsigned long long *__p)
+{
+  unsigned int __lo, __hi;
+  unsigned int 

[PATCH] D132141: [X86] Emulate _rdrand64_step with two rdrand32 if it is 32bit

2022-08-23 Thread Bing Yu via Phabricator via cfe-commits
yubing updated this revision to Diff 455041.
yubing added a comment.

address sign-conversion issue


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D132141/new/

https://reviews.llvm.org/D132141

Files:
  clang/lib/Headers/immintrin.h
  clang/test/CodeGen/X86/rdrand-builtins.c

Index: clang/test/CodeGen/X86/rdrand-builtins.c
===
--- clang/test/CodeGen/X86/rdrand-builtins.c
+++ clang/test/CodeGen/X86/rdrand-builtins.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +rdrnd -target-feature +rdseed -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X64
-// RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s -triple=i386-unknown-unknown -target-feature +rdrnd -target-feature +rdseed -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s -triple=i386-unknown-unknown -target-feature +rdrnd -target-feature +rdseed -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X86
 
 #include 
 
@@ -17,14 +17,61 @@
 // CHECK: store i32
 }
 
-#if __x86_64__
 int rdrand64(unsigned long long *p) {
   return _rdrand64_step(p);
 // X64: @rdrand64
 // X64: call { i64, i32 } @llvm.x86.rdrand.64
 // X64: store i64
+
+// X86-LABEL: @rdrand64(
+// X86-NEXT:  entry:
+// X86-NEXT:[[RETVAL_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[__P_ADDR_I:%.*]] = alloca i64*, align 4
+// X86-NEXT:[[__LO_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[__HI_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[__RES_LO_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[__RES_HI_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[P_ADDR:%.*]] = alloca i64*, align 4
+// X86-NEXT:store i64* [[P:%.*]], i64** [[P_ADDR]], align 4
+// X86-NEXT:[[TMP0:%.*]] = load i64*, i64** [[P_ADDR]], align 4
+// X86-NEXT:store i64* [[TMP0]], i64** [[__P_ADDR_I]], align 4
+// X86-NEXT:[[TMP1:%.*]] = call { i32, i32 } @llvm.x86.rdrand.32()
+// X86-NEXT:[[TMP2:%.*]] = extractvalue { i32, i32 } [[TMP1]], 0
+// X86-NEXT:store i32 [[TMP2]], i32* [[__LO_I]], align 4
+// X86-NEXT:[[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP1]], 1
+// X86-NEXT:store i32 [[TMP3]], i32* [[__RES_LO_I]], align 4
+// X86-NEXT:[[TMP4:%.*]] = call { i32, i32 } @llvm.x86.rdrand.32()
+// X86-NEXT:[[TMP5:%.*]] = extractvalue { i32, i32 } [[TMP4]], 0
+// X86-NEXT:store i32 [[TMP5]], i32* [[__HI_I]], align 4
+// X86-NEXT:[[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP4]], 1
+// X86-NEXT:store i32 [[TMP6]], i32* [[__RES_HI_I]], align 4
+// X86-NEXT:[[TMP7:%.*]] = load i32, i32* [[__RES_LO_I]], align 4
+// X86-NEXT:[[TOBOOL_I:%.*]] = icmp ne i32 [[TMP7]], 0
+// X86-NEXT:br i1 [[TOBOOL_I]], label [[LAND_LHS_TRUE_I:%.*]], label [[IF_ELSE_I:%.*]]
+// X86:   land.lhs.true.i:
+// X86-NEXT:[[TMP8:%.*]] = load i32, i32* [[__RES_HI_I]], align 4
+// X86-NEXT:[[TOBOOL1_I:%.*]] = icmp ne i32 [[TMP8]], 0
+// X86-NEXT:br i1 [[TOBOOL1_I]], label [[IF_THEN_I:%.*]], label [[IF_ELSE_I]]
+// X86:   if.then.i:
+// X86-NEXT:[[TMP9:%.*]] = load i32, i32* [[__HI_I]], align 4
+// X86-NEXT:[[CONV_I:%.*]] = zext i32 [[TMP9]] to i64
+// X86-NEXT:[[SHL_I:%.*]] = shl i64 [[CONV_I]], 32
+// X86-NEXT:[[TMP10:%.*]] = load i32, i32* [[__LO_I]], align 4
+// X86-NEXT:[[CONV2_I:%.*]] = zext i32 [[TMP10]] to i64
+// X86-NEXT:[[OR_I:%.*]] = or i64 [[SHL_I]], [[CONV2_I]]
+// X86-NEXT:[[TMP11:%.*]] = load i64*, i64** [[__P_ADDR_I]], align 4
+// X86-NEXT:store i64 [[OR_I]], i64* [[TMP11]], align 4
+// X86-NEXT:store i32 1, i32* [[RETVAL_I]], align 4
+// X86-NEXT:br label [[_RDRAND64_STEP_EXIT:%.*]]
+// X86:   if.else.i:
+// X86-NEXT:[[TMP12:%.*]] = load i64*, i64** [[__P_ADDR_I]], align 4
+// X86-NEXT:store i64 0, i64* [[TMP12]], align 4
+// X86-NEXT:store i32 0, i32* [[RETVAL_I]], align 4
+// X86-NEXT:br label [[_RDRAND64_STEP_EXIT]]
+// X86:   _rdrand64_step.exit:
+// X86-NEXT:[[TMP13:%.*]] = load i32, i32* [[RETVAL_I]], align 4
+// X86-NEXT:ret i32 [[TMP13]]
 }
-#endif
 
 int rdseed16(unsigned short *p) {
   return _rdseed16_step(p);
Index: clang/lib/Headers/immintrin.h
===
--- clang/lib/Headers/immintrin.h
+++ clang/lib/Headers/immintrin.h
@@ -287,6 +287,23 @@
 {
   return (int)__builtin_ia32_rdrand64_step(__p);
 }
+#else
+// We need to emulate the functionality of 64-bit rdrand with 2 32-bit
+// rdrand instructions.
+static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
+_rdrand64_step(unsigned long long *__p)
+{
+  unsigned int __lo, __hi;
+  unsigned int __res_lo = __builtin_ia32_rdrand32_step(&__lo);
+  unsigned int __res_hi = __builtin_ia32_rdrand32_step(&__hi);
+  if (__res_lo && __res_hi) {
+

[PATCH] D132141: [X86] Emulate _rdrand64_step with two rdrand32 if it is 32bit

2022-08-23 Thread Bing Yu via Phabricator via cfe-commits
This revision was landed with ongoing or failed builds.
This revision was automatically updated to reflect the committed changes.
Closed by commit rG07e34763b027: [X86] Emulate _rdrand64_step with two rdrand32 
if it is 32bit (authored by yubing).

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D132141/new/

https://reviews.llvm.org/D132141

Files:
  clang/lib/Headers/immintrin.h
  clang/test/CodeGen/X86/rdrand-builtins.c

Index: clang/test/CodeGen/X86/rdrand-builtins.c
===
--- clang/test/CodeGen/X86/rdrand-builtins.c
+++ clang/test/CodeGen/X86/rdrand-builtins.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +rdrnd -target-feature +rdseed -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X64
-// RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s -triple=i386-unknown-unknown -target-feature +rdrnd -target-feature +rdseed -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s -triple=i386-unknown-unknown -target-feature +rdrnd -target-feature +rdseed -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X86
 
 #include 
 
@@ -17,14 +17,61 @@
 // CHECK: store i32
 }
 
-#if __x86_64__
 int rdrand64(unsigned long long *p) {
   return _rdrand64_step(p);
 // X64: @rdrand64
 // X64: call { i64, i32 } @llvm.x86.rdrand.64
 // X64: store i64
+
+// X86-LABEL: @rdrand64(
+// X86-NEXT:  entry:
+// X86-NEXT:[[RETVAL_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[__P_ADDR_I:%.*]] = alloca i64*, align 4
+// X86-NEXT:[[__LO_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[__HI_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[__RES_LO_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[__RES_HI_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[P_ADDR:%.*]] = alloca i64*, align 4
+// X86-NEXT:store i64* [[P:%.*]], i64** [[P_ADDR]], align 4
+// X86-NEXT:[[TMP0:%.*]] = load i64*, i64** [[P_ADDR]], align 4
+// X86-NEXT:store i64* [[TMP0]], i64** [[__P_ADDR_I]], align 4
+// X86-NEXT:[[TMP1:%.*]] = call { i32, i32 } @llvm.x86.rdrand.32()
+// X86-NEXT:[[TMP2:%.*]] = extractvalue { i32, i32 } [[TMP1]], 0
+// X86-NEXT:store i32 [[TMP2]], i32* [[__LO_I]], align 4
+// X86-NEXT:[[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP1]], 1
+// X86-NEXT:store i32 [[TMP3]], i32* [[__RES_LO_I]], align 4
+// X86-NEXT:[[TMP4:%.*]] = call { i32, i32 } @llvm.x86.rdrand.32()
+// X86-NEXT:[[TMP5:%.*]] = extractvalue { i32, i32 } [[TMP4]], 0
+// X86-NEXT:store i32 [[TMP5]], i32* [[__HI_I]], align 4
+// X86-NEXT:[[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP4]], 1
+// X86-NEXT:store i32 [[TMP6]], i32* [[__RES_HI_I]], align 4
+// X86-NEXT:[[TMP7:%.*]] = load i32, i32* [[__RES_LO_I]], align 4
+// X86-NEXT:[[TOBOOL_I:%.*]] = icmp ne i32 [[TMP7]], 0
+// X86-NEXT:br i1 [[TOBOOL_I]], label [[LAND_LHS_TRUE_I:%.*]], label [[IF_ELSE_I:%.*]]
+// X86:   land.lhs.true.i:
+// X86-NEXT:[[TMP8:%.*]] = load i32, i32* [[__RES_HI_I]], align 4
+// X86-NEXT:[[TOBOOL1_I:%.*]] = icmp ne i32 [[TMP8]], 0
+// X86-NEXT:br i1 [[TOBOOL1_I]], label [[IF_THEN_I:%.*]], label [[IF_ELSE_I]]
+// X86:   if.then.i:
+// X86-NEXT:[[TMP9:%.*]] = load i32, i32* [[__HI_I]], align 4
+// X86-NEXT:[[CONV_I:%.*]] = zext i32 [[TMP9]] to i64
+// X86-NEXT:[[SHL_I:%.*]] = shl i64 [[CONV_I]], 32
+// X86-NEXT:[[TMP10:%.*]] = load i32, i32* [[__LO_I]], align 4
+// X86-NEXT:[[CONV2_I:%.*]] = zext i32 [[TMP10]] to i64
+// X86-NEXT:[[OR_I:%.*]] = or i64 [[SHL_I]], [[CONV2_I]]
+// X86-NEXT:[[TMP11:%.*]] = load i64*, i64** [[__P_ADDR_I]], align 4
+// X86-NEXT:store i64 [[OR_I]], i64* [[TMP11]], align 4
+// X86-NEXT:store i32 1, i32* [[RETVAL_I]], align 4
+// X86-NEXT:br label [[_RDRAND64_STEP_EXIT:%.*]]
+// X86:   if.else.i:
+// X86-NEXT:[[TMP12:%.*]] = load i64*, i64** [[__P_ADDR_I]], align 4
+// X86-NEXT:store i64 0, i64* [[TMP12]], align 4
+// X86-NEXT:store i32 0, i32* [[RETVAL_I]], align 4
+// X86-NEXT:br label [[_RDRAND64_STEP_EXIT]]
+// X86:   _rdrand64_step.exit:
+// X86-NEXT:[[TMP13:%.*]] = load i32, i32* [[RETVAL_I]], align 4
+// X86-NEXT:ret i32 [[TMP13]]
 }
-#endif
 
 int rdseed16(unsigned short *p) {
   return _rdseed16_step(p);
Index: clang/lib/Headers/immintrin.h
===
--- clang/lib/Headers/immintrin.h
+++ clang/lib/Headers/immintrin.h
@@ -287,6 +287,23 @@
 {
   return (int)__builtin_ia32_rdrand64_step(__p);
 }
+#else
+// We need to emulate the functionality of 64-bit rdrand with 2 32-bit
+// rdrand instructions.
+static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
+_rdrand64_step(unsigned long long *__p)
+{
+  unsigned int __lo, __hi;
+  int __res_lo = 

[PATCH] D132141: [X86] Emulate _rdrand64_step with two rdrand32 if it is 32bit

2022-08-23 Thread Bing Yu via Phabricator via cfe-commits
yubing updated this revision to Diff 454738.
yubing added a comment.

Execute the second rdrand32 despite of whether the first one fail or not


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D132141/new/

https://reviews.llvm.org/D132141

Files:
  clang/lib/Headers/immintrin.h
  clang/test/CodeGen/X86/rdrand-builtins.c

Index: clang/test/CodeGen/X86/rdrand-builtins.c
===
--- clang/test/CodeGen/X86/rdrand-builtins.c
+++ clang/test/CodeGen/X86/rdrand-builtins.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +rdrnd -target-feature +rdseed -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X64
-// RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s -triple=i386-unknown-unknown -target-feature +rdrnd -target-feature +rdseed -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s -triple=i386-unknown-unknown -target-feature +rdrnd -target-feature +rdseed -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X86
 
 #include <immintrin.h>
 
@@ -17,14 +17,61 @@
 // CHECK: store i32
 }
 
-#if __x86_64__
 int rdrand64(unsigned long long *p) {
   return _rdrand64_step(p);
 // X64: @rdrand64
 // X64: call { i64, i32 } @llvm.x86.rdrand.64
 // X64: store i64
+
+// X86-LABEL: @rdrand64(
+// X86-NEXT:  entry:
+// X86-NEXT:[[RETVAL_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[__P_ADDR_I:%.*]] = alloca i64*, align 4
+// X86-NEXT:[[__LO_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[__HI_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[__RES_LO_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[__RES_HI_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[P_ADDR:%.*]] = alloca i64*, align 4
+// X86-NEXT:store i64* [[P:%.*]], i64** [[P_ADDR]], align 4
+// X86-NEXT:[[TMP0:%.*]] = load i64*, i64** [[P_ADDR]], align 4
+// X86-NEXT:store i64* [[TMP0]], i64** [[__P_ADDR_I]], align 4
+// X86-NEXT:[[TMP1:%.*]] = call { i32, i32 } @llvm.x86.rdrand.32()
+// X86-NEXT:[[TMP2:%.*]] = extractvalue { i32, i32 } [[TMP1]], 0
+// X86-NEXT:store i32 [[TMP2]], i32* [[__LO_I]], align 4
+// X86-NEXT:[[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP1]], 1
+// X86-NEXT:store i32 [[TMP3]], i32* [[__RES_LO_I]], align 4
+// X86-NEXT:[[TMP4:%.*]] = call { i32, i32 } @llvm.x86.rdrand.32()
+// X86-NEXT:[[TMP5:%.*]] = extractvalue { i32, i32 } [[TMP4]], 0
+// X86-NEXT:store i32 [[TMP5]], i32* [[__HI_I]], align 4
+// X86-NEXT:[[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP4]], 1
+// X86-NEXT:store i32 [[TMP6]], i32* [[__RES_HI_I]], align 4
+// X86-NEXT:[[TMP7:%.*]] = load i32, i32* [[__RES_LO_I]], align 4
+// X86-NEXT:[[TOBOOL_I:%.*]] = icmp ne i32 [[TMP7]], 0
+// X86-NEXT:br i1 [[TOBOOL_I]], label [[LAND_LHS_TRUE_I:%.*]], label [[IF_ELSE_I:%.*]]
+// X86:   land.lhs.true.i:
+// X86-NEXT:[[TMP8:%.*]] = load i32, i32* [[__RES_HI_I]], align 4
+// X86-NEXT:[[TOBOOL1_I:%.*]] = icmp ne i32 [[TMP8]], 0
+// X86-NEXT:br i1 [[TOBOOL1_I]], label [[IF_THEN_I:%.*]], label [[IF_ELSE_I]]
+// X86:   if.then.i:
+// X86-NEXT:[[TMP9:%.*]] = load i32, i32* [[__HI_I]], align 4
+// X86-NEXT:[[CONV_I:%.*]] = zext i32 [[TMP9]] to i64
+// X86-NEXT:[[SHL_I:%.*]] = shl i64 [[CONV_I]], 32
+// X86-NEXT:[[TMP10:%.*]] = load i32, i32* [[__LO_I]], align 4
+// X86-NEXT:[[CONV2_I:%.*]] = zext i32 [[TMP10]] to i64
+// X86-NEXT:[[OR_I:%.*]] = or i64 [[SHL_I]], [[CONV2_I]]
+// X86-NEXT:[[TMP11:%.*]] = load i64*, i64** [[__P_ADDR_I]], align 4
+// X86-NEXT:store i64 [[OR_I]], i64* [[TMP11]], align 4
+// X86-NEXT:store i32 1, i32* [[RETVAL_I]], align 4
+// X86-NEXT:br label [[_RDRAND64_STEP_EXIT:%.*]]
+// X86:   if.else.i:
+// X86-NEXT:[[TMP12:%.*]] = load i64*, i64** [[__P_ADDR_I]], align 4
+// X86-NEXT:store i64 0, i64* [[TMP12]], align 4
+// X86-NEXT:store i32 0, i32* [[RETVAL_I]], align 4
+// X86-NEXT:br label [[_RDRAND64_STEP_EXIT]]
+// X86:   _rdrand64_step.exit:
+// X86-NEXT:[[TMP13:%.*]] = load i32, i32* [[RETVAL_I]], align 4
+// X86-NEXT:ret i32 [[TMP13]]
 }
-#endif
 
 int rdseed16(unsigned short *p) {
   return _rdseed16_step(p);
Index: clang/lib/Headers/immintrin.h
===
--- clang/lib/Headers/immintrin.h
+++ clang/lib/Headers/immintrin.h
@@ -291,6 +291,23 @@
 {
   return (int)__builtin_ia32_rdrand64_step(__p);
 }
+#else
+// We need to emulate the functionality of 64-bit rdrand with 2 32-bit
+// rdrand instructions.
+static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
+_rdrand64_step(unsigned long long *__p)
+{
+  unsigned int __lo, __hi;
+  int __res_lo = __builtin_ia32_rdrand32_step(&__lo);
+  int __res_hi = __builtin_ia32_rdrand32_step(&__hi);
+  if 

[PATCH] D132141: [X86] Emulate _rdrand64_step with two rdrand32 if it is 32bit

2022-08-22 Thread Bing Yu via Phabricator via cfe-commits
yubing added inline comments.



Comment at: clang/lib/Headers/immintrin.h:301
+  unsigned long long tmp;
+  if (__builtin_ia32_rdrand32_step((unsigned int *)&tmp) &
+  __builtin_ia32_rdrand32_step(((unsigned int *)&tmp) + 1)) {

RKSimon wrote:
> RKSimon wrote:
> > craig.topper wrote:
> > > craig.topper wrote:
> > > > Should `&` be `&&`?
> > > Can we avoid the pointer cast here? Use two unsigned ints and manually 
> > > concatenate them to a 64-bit value.
> > +1
> > ```
> > unsigned int lo, hi;
> > if (__builtin_ia32_rdrand32_step(&lo) &&
> > __builtin_ia32_rdrand32_step(&hi)) {
> >   *p = ((unsigned long)hi << 32) | lo;
> >   return 1;
> > }
> > ```
> Are there any sideeffects that we might encounter by not always performing 
> both __builtin_ia32_rdrand32_step calls?
> ```
>   unsigned int __lo, __hi;
>   int __res_lo = __builtin_ia32_rdrand32_step(&__lo);
>   int __res_hi = __builtin_ia32_rdrand32_step(&__hi);
>   if (__res_lo && __res_hi) {
> *__p = ((unsigned long long)__hi << 32) | (unsigned long long)__lo;
> return 1;
>   } else {
> *__p = 0;
> return 0;
>   }
> ```
however, if the first rdrand32 failed, then we don't need to execute the second 
one.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D132141/new/

https://reviews.llvm.org/D132141

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D132141: [X86] Emulate _rdrand64_step with two rdrand32 if it is 32bit

2022-08-21 Thread Bing Yu via Phabricator via cfe-commits
yubing updated this revision to Diff 454357.
yubing added a comment.

address craig's comments


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D132141/new/

https://reviews.llvm.org/D132141

Files:
  clang/lib/Headers/immintrin.h
  clang/test/CodeGen/X86/rdrand-builtins.c


Index: clang/test/CodeGen/X86/rdrand-builtins.c
===
--- clang/test/CodeGen/X86/rdrand-builtins.c
+++ clang/test/CodeGen/X86/rdrand-builtins.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s 
-triple=x86_64-unknown-unknown -target-feature +rdrnd -target-feature +rdseed 
-emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X64
-// RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s 
-triple=i386-unknown-unknown -target-feature +rdrnd -target-feature +rdseed 
-emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s 
-triple=i386-unknown-unknown -target-feature +rdrnd -target-feature +rdseed 
-emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X86
 
 #include <immintrin.h>
 
@@ -17,14 +17,55 @@
 // CHECK: store i32
 }
 
-#if __x86_64__
 int rdrand64(unsigned long long *p) {
   return _rdrand64_step(p);
 // X64: @rdrand64
 // X64: call { i64, i32 } @llvm.x86.rdrand.64
 // X64: store i64
+
+// X86-LABEL: @rdrand64(
+// X86-NEXT:  entry:
+// X86-NEXT:[[RETVAL_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[__P_ADDR_I:%.*]] = alloca i64*, align 4
+// X86-NEXT:[[LO_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[HI_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[P_ADDR:%.*]] = alloca i64*, align 4
+// X86-NEXT:store i64* [[P:%.*]], i64** [[P_ADDR]], align 4
+// X86-NEXT:[[TMP0:%.*]] = load i64*, i64** [[P_ADDR]], align 4
+// X86-NEXT:store i64* [[TMP0]], i64** [[__P_ADDR_I]], align 4
+// X86-NEXT:[[TMP1:%.*]] = call { i32, i32 } @llvm.x86.rdrand.32()
+// X86-NEXT:[[TMP2:%.*]] = extractvalue { i32, i32 } [[TMP1]], 0
+// X86-NEXT:store i32 [[TMP2]], i32* [[LO_I]], align 4
+// X86-NEXT:[[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP1]], 1
+// X86-NEXT:[[TOBOOL_I:%.*]] = icmp ne i32 [[TMP3]], 0
+// X86-NEXT:br i1 [[TOBOOL_I]], label [[LAND_LHS_TRUE_I:%.*]], label 
[[IF_ELSE_I:%.*]]
+// X86:   land.lhs.true.i:
+// X86-NEXT:[[TMP4:%.*]] = call { i32, i32 } @llvm.x86.rdrand.32()
+// X86-NEXT:[[TMP5:%.*]] = extractvalue { i32, i32 } [[TMP4]], 0
+// X86-NEXT:store i32 [[TMP5]], i32* [[HI_I]], align 4
+// X86-NEXT:[[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP4]], 1
+// X86-NEXT:[[TOBOOL1_I:%.*]] = icmp ne i32 [[TMP6]], 0
+// X86-NEXT:br i1 [[TOBOOL1_I]], label [[IF_THEN_I:%.*]], label 
[[IF_ELSE_I]]
+// X86:   if.then.i:
+// X86-NEXT:[[TMP7:%.*]] = load i32, i32* [[HI_I]], align 4
+// X86-NEXT:[[CONV_I:%.*]] = zext i32 [[TMP7]] to i64
+// X86-NEXT:[[SHL_I:%.*]] = shl i64 [[CONV_I]], 32
+// X86-NEXT:[[TMP8:%.*]] = load i32, i32* [[LO_I]], align 4
+// X86-NEXT:[[CONV2_I:%.*]] = zext i32 [[TMP8]] to i64
+// X86-NEXT:[[OR_I:%.*]] = or i64 [[SHL_I]], [[CONV2_I]]
+// X86-NEXT:[[TMP9:%.*]] = load i64*, i64** [[__P_ADDR_I]], align 4
+// X86-NEXT:store i64 [[OR_I]], i64* [[TMP9]], align 4
+// X86-NEXT:store i32 1, i32* [[RETVAL_I]], align 4
+// X86-NEXT:br label [[_RDRAND64_STEP_EXIT:%.*]]
+// X86:   if.else.i:
+// X86-NEXT:[[TMP10:%.*]] = load i64*, i64** [[__P_ADDR_I]], align 4
+// X86-NEXT:store i64 0, i64* [[TMP10]], align 4
+// X86-NEXT:store i32 0, i32* [[RETVAL_I]], align 4
+// X86-NEXT:br label [[_RDRAND64_STEP_EXIT]]
+// X86:   _rdrand64_step.exit:
+// X86-NEXT:[[TMP11:%.*]] = load i32, i32* [[RETVAL_I]], align 4
+// X86-NEXT:ret i32 [[TMP11]]
 }
-#endif
 
 int rdseed16(unsigned short *p) {
   return _rdseed16_step(p);
Index: clang/lib/Headers/immintrin.h
===
--- clang/lib/Headers/immintrin.h
+++ clang/lib/Headers/immintrin.h
@@ -291,6 +291,21 @@
 {
   return (int)__builtin_ia32_rdrand64_step(__p);
 }
+#else
+// We need to emulate the functionality of 64-bit rdrand with 2 32-bit
+// rdrand instructions.
+static __inline__ int __attribute__((__always_inline__, __nodebug__, 
__target__("rdrnd")))
+_rdrand64_step(unsigned long long *__p)
+{
+  unsigned int __lo, __hi;
+  if (__builtin_ia32_rdrand32_step(&__lo) && 
__builtin_ia32_rdrand32_step(&__hi)) {
+*__p = ((unsigned long long)__hi << 32) | (unsigned long long)__lo;
+return 1;
+  } else {
+*__p = 0;
+return 0;
+  }
+}
 #endif
 #endif /* __RDRND__ */
 


Index: clang/test/CodeGen/X86/rdrand-builtins.c
===
--- clang/test/CodeGen/X86/rdrand-builtins.c
+++ clang/test/CodeGen/X86/rdrand-builtins.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s 

[PATCH] D132141: [X86] Emulate _rdrand64_step with two rdrand32 if it is 32bit

2022-08-21 Thread Bing Yu via Phabricator via cfe-commits
yubing updated this revision to Diff 454356.
yubing added a comment.

address simon's comments


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D132141/new/

https://reviews.llvm.org/D132141

Files:
  clang/lib/Headers/immintrin.h
  clang/test/CodeGen/X86/rdrand-builtins.c


Index: clang/test/CodeGen/X86/rdrand-builtins.c
===
--- clang/test/CodeGen/X86/rdrand-builtins.c
+++ clang/test/CodeGen/X86/rdrand-builtins.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s 
-triple=x86_64-unknown-unknown -target-feature +rdrnd -target-feature +rdseed 
-emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X64
-// RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s 
-triple=i386-unknown-unknown -target-feature +rdrnd -target-feature +rdseed 
-emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s 
-triple=i386-unknown-unknown -target-feature +rdrnd -target-feature +rdseed 
-emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X86
 
 #include <immintrin.h>
 
@@ -17,14 +17,55 @@
 // CHECK: store i32
 }
 
-#if __x86_64__
 int rdrand64(unsigned long long *p) {
   return _rdrand64_step(p);
 // X64: @rdrand64
 // X64: call { i64, i32 } @llvm.x86.rdrand.64
 // X64: store i64
+
+// X86-LABEL: @rdrand64(
+// X86-NEXT:  entry:
+// X86-NEXT:[[RETVAL_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[__P_ADDR_I:%.*]] = alloca i64*, align 4
+// X86-NEXT:[[LO_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[HI_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[P_ADDR:%.*]] = alloca i64*, align 4
+// X86-NEXT:store i64* [[P:%.*]], i64** [[P_ADDR]], align 4
+// X86-NEXT:[[TMP0:%.*]] = load i64*, i64** [[P_ADDR]], align 4
+// X86-NEXT:store i64* [[TMP0]], i64** [[__P_ADDR_I]], align 4
+// X86-NEXT:[[TMP1:%.*]] = call { i32, i32 } @llvm.x86.rdrand.32()
+// X86-NEXT:[[TMP2:%.*]] = extractvalue { i32, i32 } [[TMP1]], 0
+// X86-NEXT:store i32 [[TMP2]], i32* [[LO_I]], align 4
+// X86-NEXT:[[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP1]], 1
+// X86-NEXT:[[TOBOOL_I:%.*]] = icmp ne i32 [[TMP3]], 0
+// X86-NEXT:br i1 [[TOBOOL_I]], label [[LAND_LHS_TRUE_I:%.*]], label 
[[IF_ELSE_I:%.*]]
+// X86:   land.lhs.true.i:
+// X86-NEXT:[[TMP4:%.*]] = call { i32, i32 } @llvm.x86.rdrand.32()
+// X86-NEXT:[[TMP5:%.*]] = extractvalue { i32, i32 } [[TMP4]], 0
+// X86-NEXT:store i32 [[TMP5]], i32* [[HI_I]], align 4
+// X86-NEXT:[[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP4]], 1
+// X86-NEXT:[[TOBOOL1_I:%.*]] = icmp ne i32 [[TMP6]], 0
+// X86-NEXT:br i1 [[TOBOOL1_I]], label [[IF_THEN_I:%.*]], label 
[[IF_ELSE_I]]
+// X86:   if.then.i:
+// X86-NEXT:[[TMP7:%.*]] = load i32, i32* [[HI_I]], align 4
+// X86-NEXT:[[CONV_I:%.*]] = zext i32 [[TMP7]] to i64
+// X86-NEXT:[[SHL_I:%.*]] = shl i64 [[CONV_I]], 32
+// X86-NEXT:[[TMP8:%.*]] = load i32, i32* [[LO_I]], align 4
+// X86-NEXT:[[CONV2_I:%.*]] = zext i32 [[TMP8]] to i64
+// X86-NEXT:[[OR_I:%.*]] = or i64 [[SHL_I]], [[CONV2_I]]
+// X86-NEXT:[[TMP9:%.*]] = load i64*, i64** [[__P_ADDR_I]], align 4
+// X86-NEXT:store i64 [[OR_I]], i64* [[TMP9]], align 4
+// X86-NEXT:store i32 1, i32* [[RETVAL_I]], align 4
+// X86-NEXT:br label [[_RDRAND64_STEP_EXIT:%.*]]
+// X86:   if.else.i:
+// X86-NEXT:[[TMP10:%.*]] = load i64*, i64** [[__P_ADDR_I]], align 4
+// X86-NEXT:store i64 0, i64* [[TMP10]], align 4
+// X86-NEXT:store i32 0, i32* [[RETVAL_I]], align 4
+// X86-NEXT:br label [[_RDRAND64_STEP_EXIT]]
+// X86:   _rdrand64_step.exit:
+// X86-NEXT:[[TMP11:%.*]] = load i32, i32* [[RETVAL_I]], align 4
+// X86-NEXT:ret i32 [[TMP11]]
 }
-#endif
 
 int rdseed16(unsigned short *p) {
   return _rdseed16_step(p);
Index: clang/lib/Headers/immintrin.h
===
--- clang/lib/Headers/immintrin.h
+++ clang/lib/Headers/immintrin.h
@@ -291,6 +291,21 @@
 {
   return (int)__builtin_ia32_rdrand64_step(__p);
 }
+#else
+// We need to emulate the functionality of 64-bit rdrand with 2 32-bit
+// rdrand instructions.
+static __inline__ int __attribute__((__always_inline__, __nodebug__, 
__target__("rdrnd")))
+_rdrand64_step(unsigned long long *__p)
+{
+  unsigned int lo, hi;
+  if (__builtin_ia32_rdrand32_step(&lo) && __builtin_ia32_rdrand32_step(&hi)) {
+*__p = ((unsigned long long)hi << 32) | (unsigned long long)lo;
+return 1;
+  } else {
+*__p = 0;
+return 0;
+  }
+}
 #endif
 #endif /* __RDRND__ */
 


Index: clang/test/CodeGen/X86/rdrand-builtins.c
===
--- clang/test/CodeGen/X86/rdrand-builtins.c
+++ clang/test/CodeGen/X86/rdrand-builtins.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature 

[PATCH] D132141: [X86] Emulate _rdrand64_step with two rdrand32 if it is 32bit

2022-08-18 Thread Bing Yu via Phabricator via cfe-commits
yubing updated this revision to Diff 453866.
yubing added a comment.

fix a small issue


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D132141/new/

https://reviews.llvm.org/D132141

Files:
  clang/lib/Headers/immintrin.h
  clang/test/CodeGen/X86/rdrand-builtins.c


Index: clang/test/CodeGen/X86/rdrand-builtins.c
===
--- clang/test/CodeGen/X86/rdrand-builtins.c
+++ clang/test/CodeGen/X86/rdrand-builtins.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s 
-triple=x86_64-unknown-unknown -target-feature +rdrnd -target-feature +rdseed 
-emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X64
-// RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s 
-triple=i386-unknown-unknown -target-feature +rdrnd -target-feature +rdseed 
-emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s 
-triple=i386-unknown-unknown -target-feature +rdrnd -target-feature +rdseed 
-emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X86
 
 #include <immintrin.h>
 
@@ -24,6 +24,52 @@
 // X64: call { i64, i32 } @llvm.x86.rdrand.64
 // X64: store i64
 }
+#else
+int rdrand64(unsigned long long *p) {
+// X86-LABEL: @rdrand64(
+// X86-NEXT:  entry:
+// X86-NEXT:[[RETVAL_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[__P_ADDR_I:%.*]] = alloca i64*, align 4
+// X86-NEXT:[[LO_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[HI_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[P_ADDR:%.*]] = alloca i64*, align 4
+// X86-NEXT:store i64* [[P:%.*]], i64** [[P_ADDR]], align 4
+// X86-NEXT:[[TMP0:%.*]] = load i64*, i64** [[P_ADDR]], align 4
+// X86-NEXT:store i64* [[TMP0]], i64** [[__P_ADDR_I]], align 4
+// X86-NEXT:[[TMP1:%.*]] = call { i32, i32 } @llvm.x86.rdrand.32()
+// X86-NEXT:[[TMP2:%.*]] = extractvalue { i32, i32 } [[TMP1]], 0
+// X86-NEXT:store i32 [[TMP2]], i32* [[LO_I]], align 4
+// X86-NEXT:[[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP1]], 1
+// X86-NEXT:[[TOBOOL_I:%.*]] = icmp ne i32 [[TMP3]], 0
+// X86-NEXT:br i1 [[TOBOOL_I]], label [[LAND_LHS_TRUE_I:%.*]], label 
[[IF_ELSE_I:%.*]]
+// X86:   land.lhs.true.i:
+// X86-NEXT:[[TMP4:%.*]] = call { i32, i32 } @llvm.x86.rdrand.32()
+// X86-NEXT:[[TMP5:%.*]] = extractvalue { i32, i32 } [[TMP4]], 0
+// X86-NEXT:store i32 [[TMP5]], i32* [[HI_I]], align 4
+// X86-NEXT:[[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP4]], 1
+// X86-NEXT:[[TOBOOL1_I:%.*]] = icmp ne i32 [[TMP6]], 0
+// X86-NEXT:br i1 [[TOBOOL1_I]], label [[IF_THEN_I:%.*]], label 
[[IF_ELSE_I]]
+// X86:   if.then.i:
+// X86-NEXT:[[TMP7:%.*]] = load i32, i32* [[HI_I]], align 4
+// X86-NEXT:[[CONV_I:%.*]] = zext i32 [[TMP7]] to i64
+// X86-NEXT:[[SHL_I:%.*]] = shl i64 [[CONV_I]], 32
+// X86-NEXT:[[TMP8:%.*]] = load i32, i32* [[LO_I]], align 4
+// X86-NEXT:[[CONV2_I:%.*]] = zext i32 [[TMP8]] to i64
+// X86-NEXT:[[OR_I:%.*]] = or i64 [[SHL_I]], [[CONV2_I]]
+// X86-NEXT:[[TMP9:%.*]] = load i64*, i64** [[__P_ADDR_I]], align 4
+// X86-NEXT:store i64 [[OR_I]], i64* [[TMP9]], align 4
+// X86-NEXT:store i32 1, i32* [[RETVAL_I]], align 4
+// X86-NEXT:br label [[_RDRAND64_STEP_EXIT:%.*]]
+// X86:   if.else.i:
+// X86-NEXT:[[TMP10:%.*]] = load i64*, i64** [[__P_ADDR_I]], align 4
+// X86-NEXT:store i64 0, i64* [[TMP10]], align 4
+// X86-NEXT:store i32 0, i32* [[RETVAL_I]], align 4
+// X86-NEXT:br label [[_RDRAND64_STEP_EXIT]]
+// X86:   _rdrand64_step.exit:
+// X86-NEXT:[[TMP11:%.*]] = load i32, i32* [[RETVAL_I]], align 4
+// X86-NEXT:ret i32 [[TMP11]]
+  return _rdrand64_step(p);
+}
 #endif
 
 int rdseed16(unsigned short *p) {
Index: clang/lib/Headers/immintrin.h
===
--- clang/lib/Headers/immintrin.h
+++ clang/lib/Headers/immintrin.h
@@ -291,6 +291,21 @@
 {
   return (int)__builtin_ia32_rdrand64_step(__p);
 }
+#else
+// We need to emulate the functionality of 64-bit rdrand with 2 32-bit
+// rdrand instructions.
+static __inline__ int __attribute__((__always_inline__, __nodebug__, 
__target__("rdrnd")))
+_rdrand64_step(unsigned long long *__p)
+{
+  unsigned int lo, hi;
+  if (__builtin_ia32_rdrand32_step(&lo) && __builtin_ia32_rdrand32_step(&hi)) {
+*__p = ((unsigned long long)hi << 32) | (unsigned long long)lo;
+return 1;
+  } else {
+*__p = 0;
+return 0;
+  }
+}
 #endif
 #endif /* __RDRND__ */
 


Index: clang/test/CodeGen/X86/rdrand-builtins.c
===
--- clang/test/CodeGen/X86/rdrand-builtins.c
+++ clang/test/CodeGen/X86/rdrand-builtins.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +rdrnd -target-feature +rdseed -emit-llvm -o - -Wall -Werror | FileCheck %s 

[PATCH] D132141: [X86] Emulate _rdrand64_step with two rdrand32 if it is 32bit

2022-08-18 Thread Bing Yu via Phabricator via cfe-commits
yubing updated this revision to Diff 453865.
yubing added a comment.

Address comments


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D132141/new/

https://reviews.llvm.org/D132141

Files:
  clang/lib/Headers/immintrin.h
  clang/test/CodeGen/X86/rdrand-builtins.c


Index: clang/test/CodeGen/X86/rdrand-builtins.c
===
--- clang/test/CodeGen/X86/rdrand-builtins.c
+++ clang/test/CodeGen/X86/rdrand-builtins.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s 
-triple=x86_64-unknown-unknown -target-feature +rdrnd -target-feature +rdseed 
-emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X64
-// RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s 
-triple=i386-unknown-unknown -target-feature +rdrnd -target-feature +rdseed 
-emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s 
-triple=i386-unknown-unknown -target-feature +rdrnd -target-feature +rdseed 
-emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X86
 
 #include <immintrin.h>
 
@@ -24,6 +24,53 @@
 // X64: call { i64, i32 } @llvm.x86.rdrand.64
 // X64: store i64
 }
+#else
+int rdrand64(unsigned long long *p) {
+// X86-LABEL: @rdrand64(
+// X86-NEXT:  entry:
+// X86-NEXT:[[RETVAL_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[__P_ADDR_I:%.*]] = alloca i64*, align 4
+// X86-NEXT:[[LO_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[HI_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[P_ADDR:%.*]] = alloca i64*, align 4
+// X86-NEXT:store i64* [[P:%.*]], i64** [[P_ADDR]], align 4
+// X86-NEXT:[[TMP0:%.*]] = load i64*, i64** [[P_ADDR]], align 4
+// X86-NEXT:store i64* [[TMP0]], i64** [[__P_ADDR_I]], align 4
+// X86-NEXT:[[TMP1:%.*]] = call { i32, i32 } @llvm.x86.rdrand.32()
+// X86-NEXT:[[TMP2:%.*]] = extractvalue { i32, i32 } [[TMP1]], 0
+// X86-NEXT:store i32 [[TMP2]], i32* [[LO_I]], align 4
+// X86-NEXT:[[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP1]], 1
+// X86-NEXT:[[TOBOOL_I:%.*]] = icmp ne i32 [[TMP3]], 0
+// X86-NEXT:br i1 [[TOBOOL_I]], label [[LAND_LHS_TRUE_I:%.*]], label 
[[IF_ELSE_I:%.*]]
+// X86:   land.lhs.true.i:
+// X86-NEXT:[[TMP4:%.*]] = call { i32, i32 } @llvm.x86.rdrand.32()
+// X86-NEXT:[[TMP5:%.*]] = extractvalue { i32, i32 } [[TMP4]], 0
+// X86-NEXT:store i32 [[TMP5]], i32* [[HI_I]], align 4
+// X86-NEXT:[[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP4]], 1
+// X86-NEXT:[[TOBOOL1_I:%.*]] = icmp ne i32 [[TMP6]], 0
+// X86-NEXT:br i1 [[TOBOOL1_I]], label [[IF_THEN_I:%.*]], label 
[[IF_ELSE_I]]
+// X86:   if.then.i:
+// X86-NEXT:[[TMP7:%.*]] = load i32, i32* [[HI_I]], align 4
+// X86-NEXT:[[CONV_I:%.*]] = zext i32 [[TMP7]] to i64
+// X86-NEXT:[[SHL_I:%.*]] = shl i64 [[CONV_I]], 32
+// X86-NEXT:[[TMP8:%.*]] = load i32, i32* [[LO_I]], align 4
+// X86-NEXT:[[CONV2_I:%.*]] = zext i32 [[TMP8]] to i64
+// X86-NEXT:[[OR_I:%.*]] = or i64 [[SHL_I]], [[CONV2_I]]
+// X86-NEXT:[[TMP9:%.*]] = load i64*, i64** [[__P_ADDR_I]], align 4
+// X86-NEXT:store i64 [[OR_I]], i64* [[TMP9]], align 4
+// X86-NEXT:store i32 1, i32* [[RETVAL_I]], align 4
+// X86-NEXT:br label [[_RDRAND64_STEP_EXIT:%.*]]
+// X86:   if.else.i:
+// X86-NEXT:[[TMP10:%.*]] = load i64*, i64** [[__P_ADDR_I]], align 4
+// X86-NEXT:store i64 0, i64* [[TMP10]], align 4
+// X86-NEXT:store i32 0, i32* [[RETVAL_I]], align 4
+// X86-NEXT:br label [[_RDRAND64_STEP_EXIT]]
+// X86:   _rdrand64_step.exit:
+// X86-NEXT:[[TMP11:%.*]] = load i32, i32* [[RETVAL_I]], align 4
+// X86-NEXT:ret i32 [[TMP11]]
+;
+  return _rdrand64_step(p);
+}
 #endif
 
 int rdseed16(unsigned short *p) {
Index: clang/lib/Headers/immintrin.h
===
--- clang/lib/Headers/immintrin.h
+++ clang/lib/Headers/immintrin.h
@@ -291,6 +291,21 @@
 {
   return (int)__builtin_ia32_rdrand64_step(__p);
 }
+#else
+// We need to emulate the functionality of 64-bit rdrand with 2 32-bit
+// rdrand instructions.
+static __inline__ int __attribute__((__always_inline__, __nodebug__, 
__target__("rdrnd")))
+_rdrand64_step(unsigned long long *__p)
+{
+  unsigned int lo, hi;
+  if (__builtin_ia32_rdrand32_step(&lo) && __builtin_ia32_rdrand32_step(&hi)) {
+*__p = ((unsigned long long)hi << 32) | (unsigned long long)lo;
+return 1;
+  } else {
+*__p = 0;
+return 0;
+  }
+}
 #endif
 #endif /* __RDRND__ */
 


Index: clang/test/CodeGen/X86/rdrand-builtins.c
===
--- clang/test/CodeGen/X86/rdrand-builtins.c
+++ clang/test/CodeGen/X86/rdrand-builtins.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +rdrnd -target-feature +rdseed -emit-llvm -o - -Wall -Werror | FileCheck %s 

[PATCH] D132141: [X86] Emulate _rdrand64_step with two rdrand32 if it is 32bit

2022-08-18 Thread Bing Yu via Phabricator via cfe-commits
yubing created this revision.
Herald added a subscriber: pengfei.
Herald added a project: All.
yubing requested review of this revision.
Herald added a project: clang.
Herald added a subscriber: cfe-commits.

Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D132141

Files:
  clang/lib/Headers/immintrin.h
  clang/test/CodeGen/X86/rdrand-builtins.c


Index: clang/test/CodeGen/X86/rdrand-builtins.c
===
--- clang/test/CodeGen/X86/rdrand-builtins.c
+++ clang/test/CodeGen/X86/rdrand-builtins.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s 
-triple=x86_64-unknown-unknown -target-feature +rdrnd -target-feature +rdseed 
-emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X64
-// RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s 
-triple=i386-unknown-unknown -target-feature +rdrnd -target-feature +rdseed 
-emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s 
-triple=i386-unknown-unknown -target-feature +rdrnd -target-feature +rdseed 
-emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X32
 
 #include 
 
@@ -24,6 +24,29 @@
 // X64: call { i64, i32 } @llvm.x86.rdrand.64
 // X64: store i64
 }
+#else
+int rdrand64(unsigned long long *p) {
+// X32: @rdrand64
+// X32: [[RETVAL_I:%.*]] = alloca i32, align 4
+// X32: call { i32, i32 } @llvm.x86.rdrand.32
+// X32: store i32
+// X32: call { i32, i32 } @llvm.x86.rdrand.32
+// X32: store i32
+// X32: [[AND_I:%.*]] = and i32
+// X32: [[TOBOOL_I:%.*]] = icmp ne i32 [[AND_I]], 0
+// X32: br i1 [[TOBOOL_I]], label [[IF_THEN_I:%.*]], label [[IF_ELSE_I:%.*]]
+// X32: if.then.i:
+// X32: store i64
+// X32: store i32 1, i32* [[RETVAL_I]], align 4
+// X32: br label [[_RDRAND64_STEP_EXIT:%.*]]
+// X32: if.else.i:
+// X32: store i64 0
+// X32: store i32 0, i32* [[RETVAL_I]], align 4
+// X32: br label [[_RDRAND64_STEP_EXIT]]
+// X32: _rdrand64_step.exit:
+// X32: %{{.*}} = load i32, i32* [[RETVAL_I]], align 4
+  return _rdrand64_step(p);
+}
 #endif
 
 int rdseed16(unsigned short *p) {
Index: clang/lib/Headers/immintrin.h
===
--- clang/lib/Headers/immintrin.h
+++ clang/lib/Headers/immintrin.h
@@ -291,6 +291,22 @@
 {
   return (int)__builtin_ia32_rdrand64_step(__p);
 }
+#else
+// We need to emulate the functionality of 64-bit rdrand with 2 32-bit
+// rdrand instructions.
+static __inline__ int __attribute__((__always_inline__, __nodebug__, 
__target__("rdrnd")))
+_rdrand64_step(unsigned long long *__p)
+{
+  unsigned long long tmp;
+  if (__builtin_ia32_rdrand32_step((unsigned int *)&tmp) &
+  __builtin_ia32_rdrand32_step(((unsigned int *)&tmp) + 1)) {
+*__p = tmp;
+return 1;
+  } else {
+*__p = 0;
+return 0;
+  }
+}
 #endif
 #endif /* __RDRND__ */
 


Index: clang/test/CodeGen/X86/rdrand-builtins.c
===
--- clang/test/CodeGen/X86/rdrand-builtins.c
+++ clang/test/CodeGen/X86/rdrand-builtins.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +rdrnd -target-feature +rdseed -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X64
-// RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s -triple=i386-unknown-unknown -target-feature +rdrnd -target-feature +rdseed -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s -triple=i386-unknown-unknown -target-feature +rdrnd -target-feature +rdseed -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X32
 
 #include <immintrin.h>
 
@@ -24,6 +24,29 @@
 // X64: call { i64, i32 } @llvm.x86.rdrand.64
 // X64: store i64
 }
+#else
+int rdrand64(unsigned long long *p) {
+// X32: @rdrand64
+// X32: [[RETVAL_I:%.*]] = alloca i32, align 4
+// X32: call { i32, i32 } @llvm.x86.rdrand.32
+// X32: store i32
+// X32: call { i32, i32 } @llvm.x86.rdrand.32
+// X32: store i32
+// X32: [[AND_I:%.*]] = and i32
+// X32: [[TOBOOL_I:%.*]] = icmp ne i32 [[AND_I]], 0
+// X32: br i1 [[TOBOOL_I]], label [[IF_THEN_I:%.*]], label [[IF_ELSE_I:%.*]]
+// X32: if.then.i:
+// X32: store i64
+// X32: store i32 1, i32* [[RETVAL_I]], align 4
+// X32: br label [[_RDRAND64_STEP_EXIT:%.*]]
+// X32: if.else.i:
+// X32: store i64 0
+// X32: store i32 0, i32* [[RETVAL_I]], align 4
+// X32: br label [[_RDRAND64_STEP_EXIT]]
+// X32: _rdrand64_step.exit:
+// X32: %{{.*}} = load i32, i32* [[RETVAL_I]], align 4
+  return _rdrand64_step(p);
+}
 #endif
 
 int rdseed16(unsigned short *p) {
Index: clang/lib/Headers/immintrin.h
===
--- clang/lib/Headers/immintrin.h
+++ clang/lib/Headers/immintrin.h
@@ -291,6 +291,22 @@
 {
   return (int)__builtin_ia32_rdrand64_step(__p);
 }
+#else
+// We need to emulate the functionality of 64-bit 

[PATCH] D115630: [CodeGen] Require use of Address::invalid() for invalid address

2021-12-14 Thread Bing Yu via Phabricator via cfe-commits
yubing added inline comments.



Comment at: clang/lib/CodeGen/Address.h:79
 class ConstantAddress : public Address {
+  ConstantAddress(nullptr_t) : Address(nullptr) {}
+

Has anyone encountered buildfail due to missing "std" before nullptr_t?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D115630/new/

https://reviews.llvm.org/D115630

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D87981: [X86] AMX programming model.

2021-06-15 Thread Bing Yu via Phabricator via cfe-commits
yubing added inline comments.



Comment at: llvm/lib/Target/X86/X86ISelDAGToDAG.cpp:4675
 }
+case Intrinsic::x86_tilestored64_internal: {
+  unsigned Opc = X86::PTILESTOREDV;

It seems there should be a check here, according to line4575:

```
if (!Subtarget->hasAMXTILE())
  break;
```






Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D87981/new/

https://reviews.llvm.org/D87981

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D103784: [X86] Support __tile_stream_loadd intrinsic for new AMX interface

2021-06-11 Thread Bing Yu via Phabricator via cfe-commits
This revision was landed with ongoing or failed builds.
This revision was automatically updated to reflect the committed changes.
Closed by commit rG56d5c46b494d: [X86] Support __tile_stream_loadd intrinsic 
for new AMX interface (authored by yubing).

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D103784/new/

https://reviews.llvm.org/D103784

Files:
  clang/include/clang/Basic/BuiltinsX86_64.def
  clang/lib/Headers/amxintrin.h
  clang/test/CodeGen/X86/amx_api.c
  llvm/include/llvm/IR/IntrinsicsX86.td
  llvm/lib/Target/X86/X86ExpandPseudo.cpp
  llvm/lib/Target/X86/X86FastTileConfig.cpp
  llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
  llvm/lib/Target/X86/X86InstrAMX.td
  llvm/lib/Target/X86/X86LowerAMXType.cpp
  llvm/lib/Target/X86/X86PreAMXConfig.cpp
  llvm/lib/Target/X86/X86RegisterInfo.cpp
  llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll

Index: llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
===
--- llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
+++ llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
@@ -23,6 +23,7 @@
 ; CHECK-NEXT:tdpbusd %tmm2, %tmm1, %tmm0
 ; CHECK-NEXT:tdpbuud %tmm2, %tmm1, %tmm0
 ; CHECK-NEXT:tdpbf16ps %tmm2, %tmm1, %tmm0
+; CHECK-NEXT:tileloaddt1 (%rsi,%rdx), %tmm1
 ; CHECK-NEXT:tilestored %tmm0, (%rdi,%rdx)
 ; CHECK-NEXT:tilerelease
 ; CHECK-NEXT:vzeroupper
@@ -35,6 +36,7 @@
   %d2 = call x86_amx @llvm.x86.tdpbusd.internal(i16 8, i16 8, i16 8, x86_amx %d1, x86_amx %a, x86_amx %b)
   %d3 = call x86_amx @llvm.x86.tdpbuud.internal(i16 8, i16 8, i16 8, x86_amx %d2, x86_amx %a, x86_amx %b)
   %d4 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 8, i16 8, i16 8, x86_amx %d3, x86_amx %a, x86_amx %b)
+  %e = call x86_amx @llvm.x86.tileloaddt164.internal(i16 8, i16 8, i8* %base, i64 %stride)
   call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %d4)
 
   ret void
@@ -42,6 +44,7 @@
 
 declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
 declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
+declare x86_amx @llvm.x86.tileloaddt164.internal(i16, i16, i8*, i64)
 declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
 declare x86_amx @llvm.x86.tdpbsud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
 declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
Index: llvm/lib/Target/X86/X86RegisterInfo.cpp
===
--- llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -892,6 +892,7 @@
   }
   // We only collect the tile shape that is defined.
   case X86::PTILELOADDV:
+  case X86::PTILELOADDT1V:
   case X86::PTDPBSSDV:
   case X86::PTDPBSUDV:
   case X86::PTDPBUSDV:
Index: llvm/lib/Target/X86/X86PreAMXConfig.cpp
===
--- llvm/lib/Target/X86/X86PreAMXConfig.cpp
+++ llvm/lib/Target/X86/X86PreAMXConfig.cpp
@@ -65,7 +65,8 @@
 }
 
 static bool isTileLoad(IntrinsicInst *II) {
-  return II->getIntrinsicID() == Intrinsic::x86_tileloadd64_internal;
+  return II->getIntrinsicID() == Intrinsic::x86_tileloadd64_internal ||
+ II->getIntrinsicID() == Intrinsic::x86_tileloaddt164_internal;
 }
 
 static bool isTileStore(IntrinsicInst *II) {
Index: llvm/lib/Target/X86/X86LowerAMXType.cpp
===
--- llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -121,6 +121,7 @@
   default:
 llvm_unreachable("Expect amx intrinsics");
   case Intrinsic::x86_tileloadd64_internal:
+  case Intrinsic::x86_tileloaddt164_internal:
   case Intrinsic::x86_tilestored64_internal: {
 Row = II->getArgOperand(0);
 Col = II->getArgOperand(1);
Index: llvm/lib/Target/X86/X86InstrAMX.td
===
--- llvm/lib/Target/X86/X86InstrAMX.td
+++ llvm/lib/Target/X86/X86InstrAMX.td
@@ -53,6 +53,9 @@
 def PTILELOADDV : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
  GR16:$src2,
  opaquemem:$src3), []>;
+def PTILELOADDT1V : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
+   GR16:$src2,
+   opaquemem:$src3), []>;
 def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1,
 GR16:$src2, opaquemem:$src3,
 TILE:$src4), []>;
Index: llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
===
--- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -4617,10 +4617,13 @@
   ReplaceNode(Node, Res);
   return;
 }
-case 

[PATCH] D103784: [X86] Support __tile_stream_loadd intrinsic for new AMX interface

2021-06-09 Thread Bing Yu via Phabricator via cfe-commits
yubing updated this revision to Diff 350803.
yubing added a comment.

Rebase


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D103784/new/

https://reviews.llvm.org/D103784

Files:
  clang/include/clang/Basic/BuiltinsX86_64.def
  clang/lib/Headers/amxintrin.h
  clang/test/CodeGen/X86/amx_api.c
  llvm/include/llvm/IR/IntrinsicsX86.td
  llvm/lib/Target/X86/X86ExpandPseudo.cpp
  llvm/lib/Target/X86/X86FastTileConfig.cpp
  llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
  llvm/lib/Target/X86/X86InstrAMX.td
  llvm/lib/Target/X86/X86LowerAMXType.cpp
  llvm/lib/Target/X86/X86PreAMXConfig.cpp
  llvm/lib/Target/X86/X86RegisterInfo.cpp
  llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll

Index: llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
===
--- llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
+++ llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
@@ -23,6 +23,7 @@
 ; CHECK-NEXT:tdpbusd %tmm2, %tmm1, %tmm0
 ; CHECK-NEXT:tdpbuud %tmm2, %tmm1, %tmm0
 ; CHECK-NEXT:tdpbf16ps %tmm2, %tmm1, %tmm0
+; CHECK-NEXT:tileloaddt1 (%rsi,%rdx), %tmm1
 ; CHECK-NEXT:tilestored %tmm0, (%rdi,%rdx)
 ; CHECK-NEXT:tilerelease
 ; CHECK-NEXT:vzeroupper
@@ -35,6 +36,7 @@
   %d2 = call x86_amx @llvm.x86.tdpbusd.internal(i16 8, i16 8, i16 8, x86_amx %d1, x86_amx %a, x86_amx %b)
   %d3 = call x86_amx @llvm.x86.tdpbuud.internal(i16 8, i16 8, i16 8, x86_amx %d2, x86_amx %a, x86_amx %b)
   %d4 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 8, i16 8, i16 8, x86_amx %d3, x86_amx %a, x86_amx %b)
+  %e = call x86_amx @llvm.x86.tileloaddt164.internal(i16 8, i16 8, i8* %base, i64 %stride)
   call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %d4)
 
   ret void
@@ -42,6 +44,7 @@
 
 declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
 declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
+declare x86_amx @llvm.x86.tileloaddt164.internal(i16, i16, i8*, i64)
 declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
 declare x86_amx @llvm.x86.tdpbsud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
 declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
Index: llvm/lib/Target/X86/X86RegisterInfo.cpp
===
--- llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -892,6 +892,7 @@
   }
   // We only collect the tile shape that is defined.
   case X86::PTILELOADDV:
+  case X86::PTILELOADDT1V:
   case X86::PTDPBSSDV:
   case X86::PTDPBSUDV:
   case X86::PTDPBUSDV:
Index: llvm/lib/Target/X86/X86PreAMXConfig.cpp
===
--- llvm/lib/Target/X86/X86PreAMXConfig.cpp
+++ llvm/lib/Target/X86/X86PreAMXConfig.cpp
@@ -65,7 +65,8 @@
 }
 
 static bool isTileLoad(IntrinsicInst *II) {
-  return II->getIntrinsicID() == Intrinsic::x86_tileloadd64_internal;
+  return II->getIntrinsicID() == Intrinsic::x86_tileloadd64_internal ||
+ II->getIntrinsicID() == Intrinsic::x86_tileloaddt164_internal;
 }
 
 static bool isTileStore(IntrinsicInst *II) {
Index: llvm/lib/Target/X86/X86LowerAMXType.cpp
===
--- llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -121,6 +121,7 @@
   default:
 llvm_unreachable("Expect amx intrinsics");
   case Intrinsic::x86_tileloadd64_internal:
+  case Intrinsic::x86_tileloaddt164_internal:
   case Intrinsic::x86_tilestored64_internal: {
 Row = II->getArgOperand(0);
 Col = II->getArgOperand(1);
Index: llvm/lib/Target/X86/X86InstrAMX.td
===
--- llvm/lib/Target/X86/X86InstrAMX.td
+++ llvm/lib/Target/X86/X86InstrAMX.td
@@ -53,6 +53,9 @@
 def PTILELOADDV : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
  GR16:$src2,
  opaquemem:$src3), []>;
+def PTILELOADDT1V : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
+   GR16:$src2,
+   opaquemem:$src3), []>;
 def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1,
 GR16:$src2, opaquemem:$src3,
 TILE:$src4), []>;
Index: llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
===
--- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -4617,10 +4617,13 @@
   ReplaceNode(Node, Res);
   return;
 }
-case Intrinsic::x86_tileloadd64_internal: {
+case Intrinsic::x86_tileloadd64_internal:
+case Intrinsic::x86_tileloaddt164_internal: {
   if (!Subtarget->hasAMXTILE())
 

[PATCH] D103784: [X86] Support __tile_stream_loadd intrinsic for new AMX interface

2021-06-09 Thread Bing Yu via Phabricator via cfe-commits
yubing updated this revision to Diff 350797.
yubing added a comment.

Address yuanke's comments


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D103784/new/

https://reviews.llvm.org/D103784

Files:
  clang/include/clang/Basic/BuiltinsX86_64.def
  clang/lib/Headers/amxintrin.h
  clang/test/CodeGen/X86/amx_api.c
  llvm/include/llvm/IR/IntrinsicsX86.td
  llvm/lib/Target/X86/X86ExpandPseudo.cpp
  llvm/lib/Target/X86/X86FastTileConfig.cpp
  llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
  llvm/lib/Target/X86/X86InstrAMX.td
  llvm/lib/Target/X86/X86LowerAMXType.cpp
  llvm/lib/Target/X86/X86PreAMXConfig.cpp
  llvm/lib/Target/X86/X86RegisterInfo.cpp
  llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll

Index: llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
===
--- llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
+++ llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
@@ -23,6 +23,7 @@
 ; CHECK-NEXT:tdpbusd %tmm2, %tmm1, %tmm0
 ; CHECK-NEXT:tdpbuud %tmm2, %tmm1, %tmm0
 ; CHECK-NEXT:tdpbf16ps %tmm2, %tmm1, %tmm0
+; CHECK-NEXT:tileloaddt1 (%rsi,%rdx), %tmm1
 ; CHECK-NEXT:tilestored %tmm0, (%rdi,%rdx)
 ; CHECK-NEXT:tilerelease
 ; CHECK-NEXT:vzeroupper
@@ -35,6 +36,7 @@
   %d2 = call x86_amx @llvm.x86.tdpbusd.internal(i16 8, i16 8, i16 8, x86_amx %d1, x86_amx %a, x86_amx %b)
   %d3 = call x86_amx @llvm.x86.tdpbuud.internal(i16 8, i16 8, i16 8, x86_amx %d2, x86_amx %a, x86_amx %b)
   %d4 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 8, i16 8, i16 8, x86_amx %d3, x86_amx %a, x86_amx %b)
+  %e = call x86_amx @llvm.x86.tileloaddt164.internal(i16 8, i16 8, i8* %base, i64 %stride)
   call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %d4)
 
   ret void
@@ -42,6 +44,7 @@
 
 declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
 declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
+declare x86_amx @llvm.x86.tileloaddt164.internal(i16, i16, i8*, i64)
 declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
 declare x86_amx @llvm.x86.tdpbsud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
 declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
Index: llvm/lib/Target/X86/X86RegisterInfo.cpp
===
--- llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -892,6 +892,7 @@
   }
   // We only collect the tile shape that is defined.
   case X86::PTILELOADDV:
+  case X86::PTILELOADDT1V:
   case X86::PTDPBSSDV:
   case X86::PTDPBSUDV:
   case X86::PTDPBUSDV:
Index: llvm/lib/Target/X86/X86PreAMXConfig.cpp
===
--- llvm/lib/Target/X86/X86PreAMXConfig.cpp
+++ llvm/lib/Target/X86/X86PreAMXConfig.cpp
@@ -65,7 +65,8 @@
 }
 
 static bool isTileLoad(IntrinsicInst *II) {
-  return II->getIntrinsicID() == Intrinsic::x86_tileloadd64_internal;
+  return II->getIntrinsicID() == Intrinsic::x86_tileloadd64_internal ||
+ II->getIntrinsicID() == Intrinsic::x86_tileloaddt164_internal;
 }
 
 static bool isTileStore(IntrinsicInst *II) {
Index: llvm/lib/Target/X86/X86LowerAMXType.cpp
===
--- llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -121,6 +121,7 @@
   default:
 llvm_unreachable("Expect amx intrinsics");
   case Intrinsic::x86_tileloadd64_internal:
+  case Intrinsic::x86_tileloaddt164_internal:
   case Intrinsic::x86_tilestored64_internal: {
 Row = II->getArgOperand(0);
 Col = II->getArgOperand(1);
Index: llvm/lib/Target/X86/X86InstrAMX.td
===
--- llvm/lib/Target/X86/X86InstrAMX.td
+++ llvm/lib/Target/X86/X86InstrAMX.td
@@ -53,6 +53,9 @@
 def PTILELOADDV : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
  GR16:$src2,
  opaquemem:$src3), []>;
+def PTILELOADDT1V : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
+   GR16:$src2,
+   opaquemem:$src3), []>;
 def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1,
 GR16:$src2, opaquemem:$src3,
 TILE:$src4), []>;
Index: llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
===
--- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -4617,10 +4617,13 @@
   ReplaceNode(Node, Res);
   return;
 }
-case Intrinsic::x86_tileloadd64_internal: {
+case Intrinsic::x86_tileloadd64_internal:
+case Intrinsic::x86_tileloaddt164_internal: {
   if 

[PATCH] D103784: [X86] Support __tile_stream_loadd intrinsic for new AMX interface

2021-06-06 Thread Bing Yu via Phabricator via cfe-commits
yubing created this revision.
Herald added subscribers: pengfei, hiraditya.
yubing requested review of this revision.
Herald added projects: clang, LLVM.
Herald added subscribers: llvm-commits, cfe-commits.

Adding support for __tile_stream_loadd intrinsic.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D103784

Files:
  clang/include/clang/Basic/BuiltinsX86_64.def
  clang/lib/Headers/amxintrin.h
  clang/test/CodeGen/X86/amx_api.c
  llvm/include/llvm/IR/IntrinsicsX86.td
  llvm/lib/Target/X86/X86ExpandPseudo.cpp
  llvm/lib/Target/X86/X86FastTileConfig.cpp
  llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
  llvm/lib/Target/X86/X86InstrAMX.td
  llvm/lib/Target/X86/X86LowerAMXType.cpp
  llvm/lib/Target/X86/X86RegisterInfo.cpp
  llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll

Index: llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
===
--- llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
+++ llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
@@ -23,6 +23,7 @@
 ; CHECK-NEXT:tdpbusd %tmm2, %tmm1, %tmm0
 ; CHECK-NEXT:tdpbuud %tmm2, %tmm1, %tmm0
 ; CHECK-NEXT:tdpbf16ps %tmm2, %tmm1, %tmm0
+; CHECK-NEXT:tileloaddt1 (%rsi,%rdx), %tmm1
 ; CHECK-NEXT:tilestored %tmm0, (%rdi,%rdx)
 ; CHECK-NEXT:tilerelease
 ; CHECK-NEXT:vzeroupper
@@ -35,6 +36,7 @@
   %d2 = call x86_amx @llvm.x86.tdpbusd.internal(i16 8, i16 8, i16 8, x86_amx %d1, x86_amx %a, x86_amx %b)
   %d3 = call x86_amx @llvm.x86.tdpbuud.internal(i16 8, i16 8, i16 8, x86_amx %d2, x86_amx %a, x86_amx %b)
   %d4 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 8, i16 8, i16 8, x86_amx %d3, x86_amx %a, x86_amx %b)
+  %e = call x86_amx @llvm.x86.tileloaddt164.internal(i16 8, i16 8, i8* %base, i64 %stride)
   call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %d4)
 
   ret void
@@ -42,6 +44,7 @@
 
 declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
 declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
+declare x86_amx @llvm.x86.tileloaddt164.internal(i16, i16, i8*, i64)
 declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
 declare x86_amx @llvm.x86.tdpbsud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
 declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
Index: llvm/lib/Target/X86/X86RegisterInfo.cpp
===
--- llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -892,6 +892,7 @@
   }
   // We only collect the tile shape that is defined.
   case X86::PTILELOADDV:
+  case X86::PTILELOADDT1V:
   case X86::PTDPBSSDV:
   case X86::PTDPBSUDV:
   case X86::PTDPBUSDV:
Index: llvm/lib/Target/X86/X86LowerAMXType.cpp
===
--- llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -121,6 +121,7 @@
   default:
 llvm_unreachable("Expect amx intrinsics");
   case Intrinsic::x86_tileloadd64_internal:
+  case Intrinsic::x86_tileloaddt164_internal:
   case Intrinsic::x86_tilestored64_internal: {
 Row = II->getArgOperand(0);
 Col = II->getArgOperand(1);
Index: llvm/lib/Target/X86/X86InstrAMX.td
===
--- llvm/lib/Target/X86/X86InstrAMX.td
+++ llvm/lib/Target/X86/X86InstrAMX.td
@@ -53,6 +53,9 @@
 def PTILELOADDV : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
  GR16:$src2,
  opaquemem:$src3), []>;
+def PTILELOADDT1V : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
+ GR16:$src2,
+ opaquemem:$src3), []>;
 def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1,
 GR16:$src2, opaquemem:$src3,
 TILE:$src4), []>;
Index: llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
===
--- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -4617,10 +4617,13 @@
   ReplaceNode(Node, Res);
   return;
 }
-case Intrinsic::x86_tileloadd64_internal: {
+case Intrinsic::x86_tileloadd64_internal:
+case Intrinsic::x86_tileloaddt164_internal: {
   if (!Subtarget->hasAMXTILE())
 break;
-  unsigned Opc = X86::PTILELOADDV;
+  unsigned Opc = IntNo == Intrinsic::x86_tileloadd64_internal
+ ? X86::PTILELOADDV
+ : X86::PTILELOADDT1V;
   // _tile_loadd_internal(row, col, buf, STRIDE)
   SDValue Base = Node->getOperand(4);
   SDValue Scale = getI8Imm(1, dl);
Index: llvm/lib/Target/X86/X86FastTileConfig.cpp
===
--- 

[PATCH] D96110: [X86] Pass to transform tdpbf16ps intrinsics to scalar operation.

2021-03-21 Thread Bing Yu via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rG113f077f808f: [X86] Pass to transform tdpbf16ps intrinsics 
to scalar operation. (authored by yubing).

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D96110/new/

https://reviews.llvm.org/D96110

Files:
  llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
  llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll

Index: llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
===
--- llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
+++ llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
@@ -97,8 +97,8 @@
   ret void
 }
 
-define dso_local void @test_amx_dp(i16 signext %row, i16 signext %col, i16 signext %k, <256 x i32> %c, <256 x i32> %a, <256 x i32> %b, <256 x i32>* %vptr) #0 {
-; CHECK-LABEL: @test_amx_dp(
+define dso_local void @test_amx_dpbssd(i16 signext %row, i16 signext %col, i16 signext %k, <256 x i32> %c, <256 x i32> %a, <256 x i32> %b, <256 x i32>* %vptr) #0 {
+; CHECK-LABEL: @test_amx_dpbssd(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:[[A_AMX:%.*]] = bitcast <256 x i32> [[A:%.*]] to x86_amx
 ; CHECK-NEXT:[[B_AMX:%.*]] = bitcast <256 x i32> [[B:%.*]] to x86_amx
@@ -172,6 +172,84 @@
   ret void
 }
 
+define dso_local void @test_amx_dpbf16ps(i16 signext %row, i16 signext %col, i16 signext %k, <256 x i32> %c, <256 x i32> %a, <256 x i32> %b, <256 x i32>* %vptr) #0 {
+; CHECK-LABEL: @test_amx_dpbf16ps(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:[[A_AMX:%.*]] = bitcast <256 x i32> [[A:%.*]] to x86_amx
+; CHECK-NEXT:[[B_AMX:%.*]] = bitcast <256 x i32> [[B:%.*]] to x86_amx
+; CHECK-NEXT:[[C_AMX:%.*]] = bitcast <256 x i32> [[C:%.*]] to x86_amx
+; CHECK-NEXT:[[TMP0:%.*]] = lshr i16 [[COL:%.*]], 2
+; CHECK-NEXT:[[TMP1:%.*]] = lshr i16 [[K:%.*]], 2
+; CHECK-NEXT:br label [[TDPBF16PS_SCALARIZE_ROWS_HEADER:%.*]]
+; CHECK:   tdpbf16ps.scalarize.rows.header:
+; CHECK-NEXT:[[TDPBF16PS_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TDPBF16PS_SCALARIZE_ROWS_STEP:%.*]], [[TDPBF16PS_SCALARIZE_ROWS_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_C_PHI_ROW:%.*]] = phi <256 x i32> [ [[C]], [[ENTRY]] ], [ [[TMP21:%.*]], [[TDPBF16PS_SCALARIZE_ROWS_LATCH]] ]
+; CHECK-NEXT:[[VEC_D_PHI_ROW:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP23:%.*]], [[TDPBF16PS_SCALARIZE_ROWS_LATCH]] ]
+; CHECK-NEXT:br label [[TDPBF16PS_SCALARIZE_ROWS_BODY:%.*]]
+; CHECK:   tdpbf16ps.scalarize.rows.body:
+; CHECK-NEXT:br label [[TDPBF16PS_SCALARIZE_COLS_HEADER:%.*]]
+; CHECK:   tdpbf16ps.scalarize.cols.header:
+; CHECK-NEXT:[[TDPBF16PS_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TDPBF16PS_SCALARIZE_ROWS_BODY]] ], [ [[TDPBF16PS_SCALARIZE_COLS_STEP:%.*]], [[TDPBF16PS_SCALARIZE_COLS_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_C_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_ROW]], [[TDPBF16PS_SCALARIZE_ROWS_BODY]] ], [ [[TMP21]], [[TDPBF16PS_SCALARIZE_COLS_LATCH]] ]
+; CHECK-NEXT:[[VEC_D_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_D_PHI_ROW]], [[TDPBF16PS_SCALARIZE_ROWS_BODY]] ], [ [[TMP23]], [[TDPBF16PS_SCALARIZE_COLS_LATCH]] ]
+; CHECK-NEXT:[[TMP2:%.*]] = mul i16 [[TDPBF16PS_SCALARIZE_ROWS_IV]], 16
+; CHECK-NEXT:[[TMP3:%.*]] = add i16 [[TMP2]], [[TDPBF16PS_SCALARIZE_COLS_IV]]
+; CHECK-NEXT:br label [[TDPBF16PS_SCALARIZE_COLS_BODY:%.*]]
+; CHECK:   tdpbf16ps.scalarize.cols.body:
+; CHECK-NEXT:br label [[TDPBF16PS_SCALARIZE_INNER_HEADER:%.*]]
+; CHECK:   tdpbf16ps.scalarize.inner.header:
+; CHECK-NEXT:[[TDPBF16PS_SCALARIZE_INNER_IV:%.*]] = phi i16 [ 0, [[TDPBF16PS_SCALARIZE_COLS_BODY]] ], [ [[TDPBF16PS_SCALARIZE_INNER_STEP:%.*]], [[TDPBF16PS_SCALARIZE_INNER_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_C_INNER_PHI:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_COL]], [[TDPBF16PS_SCALARIZE_COLS_BODY]] ], [ [[TMP21]], [[TDPBF16PS_SCALARIZE_INNER_LATCH]] ]
+; CHECK-NEXT:br label [[TDPBF16PS_SCALARIZE_INNER_BODY:%.*]]
+; CHECK:   tdpbf16ps.scalarize.inner.body:
+; CHECK-NEXT:[[TMP4:%.*]] = mul i16 [[TDPBF16PS_SCALARIZE_ROWS_IV]], 16
+; CHECK-NEXT:[[TMP5:%.*]] = add i16 [[TMP4]], [[TDPBF16PS_SCALARIZE_INNER_IV]]
+; CHECK-NEXT:[[TMP6:%.*]] = mul i16 [[TDPBF16PS_SCALARIZE_INNER_IV]], 16
+; CHECK-NEXT:[[TMP7:%.*]] = add i16 [[TMP6]], [[TDPBF16PS_SCALARIZE_COLS_IV]]
+; CHECK-NEXT:[[TMP8:%.*]] = extractelement <256 x i32> [[VEC_C_INNER_PHI]], i16 [[TMP3]]
+; CHECK-NEXT:[[TMP9:%.*]] = bitcast i32 [[TMP8]] to float
+; CHECK-NEXT:[[TMP10:%.*]] = extractelement <256 x i32> [[A]], i16 [[TMP5]]
+; CHECK-NEXT:[[TMP11:%.*]] = bitcast i32 [[TMP10]] to <2 x i16>
+; CHECK-NEXT:[[TMP12:%.*]] = extractelement <256 x i32> [[B]], i16 [[TMP7]]
+; CHECK-NEXT:[[TMP13:%.*]] = bitcast i32 [[TMP12]] to <2 x i16>
+; CHECK-NEXT:[[TMP14:%.*]] = shufflevector <2 x i16> [[TMP11]], <2 x i16> zeroinitializer, <4 x i32> 
+; CHECK-NEXT:[[TMP15:%.*]] = bitcast <4 x i16> [[TMP14]] to <2 x 

[PATCH] D96110: [X86] Pass to transform tdpbf16ps intrinsics to scalar operation.

2021-03-21 Thread Bing Yu via Phabricator via cfe-commits
yubing updated this revision to Diff 332190.
yubing added a comment.

Rebase after https://reviews.llvm.org/D98773 is merged.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D96110/new/

https://reviews.llvm.org/D96110

Files:
  llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
  llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll

Index: llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
===
--- llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
+++ llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
@@ -97,8 +97,8 @@
   ret void
 }
 
-define dso_local void @test_amx_dp(i16 signext %row, i16 signext %col, i16 signext %k, <256 x i32> %c, <256 x i32> %a, <256 x i32> %b, <256 x i32>* %vptr) #0 {
-; CHECK-LABEL: @test_amx_dp(
+define dso_local void @test_amx_dpbssd(i16 signext %row, i16 signext %col, i16 signext %k, <256 x i32> %c, <256 x i32> %a, <256 x i32> %b, <256 x i32>* %vptr) #0 {
+; CHECK-LABEL: @test_amx_dpbssd(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:[[A_AMX:%.*]] = bitcast <256 x i32> [[A:%.*]] to x86_amx
 ; CHECK-NEXT:[[B_AMX:%.*]] = bitcast <256 x i32> [[B:%.*]] to x86_amx
@@ -172,6 +172,84 @@
   ret void
 }
 
+define dso_local void @test_amx_dpbf16ps(i16 signext %row, i16 signext %col, i16 signext %k, <256 x i32> %c, <256 x i32> %a, <256 x i32> %b, <256 x i32>* %vptr) #0 {
+; CHECK-LABEL: @test_amx_dpbf16ps(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:[[A_AMX:%.*]] = bitcast <256 x i32> [[A:%.*]] to x86_amx
+; CHECK-NEXT:[[B_AMX:%.*]] = bitcast <256 x i32> [[B:%.*]] to x86_amx
+; CHECK-NEXT:[[C_AMX:%.*]] = bitcast <256 x i32> [[C:%.*]] to x86_amx
+; CHECK-NEXT:[[TMP0:%.*]] = lshr i16 [[COL:%.*]], 2
+; CHECK-NEXT:[[TMP1:%.*]] = lshr i16 [[K:%.*]], 2
+; CHECK-NEXT:br label [[TDPBF16PS_SCALARIZE_ROWS_HEADER:%.*]]
+; CHECK:   tdpbf16ps.scalarize.rows.header:
+; CHECK-NEXT:[[TDPBF16PS_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TDPBF16PS_SCALARIZE_ROWS_STEP:%.*]], [[TDPBF16PS_SCALARIZE_ROWS_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_C_PHI_ROW:%.*]] = phi <256 x i32> [ [[C]], [[ENTRY]] ], [ [[TMP21:%.*]], [[TDPBF16PS_SCALARIZE_ROWS_LATCH]] ]
+; CHECK-NEXT:[[VEC_D_PHI_ROW:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP23:%.*]], [[TDPBF16PS_SCALARIZE_ROWS_LATCH]] ]
+; CHECK-NEXT:br label [[TDPBF16PS_SCALARIZE_ROWS_BODY:%.*]]
+; CHECK:   tdpbf16ps.scalarize.rows.body:
+; CHECK-NEXT:br label [[TDPBF16PS_SCALARIZE_COLS_HEADER:%.*]]
+; CHECK:   tdpbf16ps.scalarize.cols.header:
+; CHECK-NEXT:[[TDPBF16PS_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TDPBF16PS_SCALARIZE_ROWS_BODY]] ], [ [[TDPBF16PS_SCALARIZE_COLS_STEP:%.*]], [[TDPBF16PS_SCALARIZE_COLS_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_C_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_ROW]], [[TDPBF16PS_SCALARIZE_ROWS_BODY]] ], [ [[TMP21]], [[TDPBF16PS_SCALARIZE_COLS_LATCH]] ]
+; CHECK-NEXT:[[VEC_D_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_D_PHI_ROW]], [[TDPBF16PS_SCALARIZE_ROWS_BODY]] ], [ [[TMP23]], [[TDPBF16PS_SCALARIZE_COLS_LATCH]] ]
+; CHECK-NEXT:[[TMP2:%.*]] = mul i16 [[TDPBF16PS_SCALARIZE_ROWS_IV]], 16
+; CHECK-NEXT:[[TMP3:%.*]] = add i16 [[TMP2]], [[TDPBF16PS_SCALARIZE_COLS_IV]]
+; CHECK-NEXT:br label [[TDPBF16PS_SCALARIZE_COLS_BODY:%.*]]
+; CHECK:   tdpbf16ps.scalarize.cols.body:
+; CHECK-NEXT:br label [[TDPBF16PS_SCALARIZE_INNER_HEADER:%.*]]
+; CHECK:   tdpbf16ps.scalarize.inner.header:
+; CHECK-NEXT:[[TDPBF16PS_SCALARIZE_INNER_IV:%.*]] = phi i16 [ 0, [[TDPBF16PS_SCALARIZE_COLS_BODY]] ], [ [[TDPBF16PS_SCALARIZE_INNER_STEP:%.*]], [[TDPBF16PS_SCALARIZE_INNER_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_C_INNER_PHI:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_COL]], [[TDPBF16PS_SCALARIZE_COLS_BODY]] ], [ [[TMP21]], [[TDPBF16PS_SCALARIZE_INNER_LATCH]] ]
+; CHECK-NEXT:br label [[TDPBF16PS_SCALARIZE_INNER_BODY:%.*]]
+; CHECK:   tdpbf16ps.scalarize.inner.body:
+; CHECK-NEXT:[[TMP4:%.*]] = mul i16 [[TDPBF16PS_SCALARIZE_ROWS_IV]], 16
+; CHECK-NEXT:[[TMP5:%.*]] = add i16 [[TMP4]], [[TDPBF16PS_SCALARIZE_INNER_IV]]
+; CHECK-NEXT:[[TMP6:%.*]] = mul i16 [[TDPBF16PS_SCALARIZE_INNER_IV]], 16
+; CHECK-NEXT:[[TMP7:%.*]] = add i16 [[TMP6]], [[TDPBF16PS_SCALARIZE_COLS_IV]]
+; CHECK-NEXT:[[TMP8:%.*]] = extractelement <256 x i32> [[VEC_C_INNER_PHI]], i16 [[TMP3]]
+; CHECK-NEXT:[[TMP9:%.*]] = bitcast i32 [[TMP8]] to float
+; CHECK-NEXT:[[TMP10:%.*]] = extractelement <256 x i32> [[A]], i16 [[TMP5]]
+; CHECK-NEXT:[[TMP11:%.*]] = bitcast i32 [[TMP10]] to <2 x i16>
+; CHECK-NEXT:[[TMP12:%.*]] = extractelement <256 x i32> [[B]], i16 [[TMP7]]
+; CHECK-NEXT:[[TMP13:%.*]] = bitcast i32 [[TMP12]] to <2 x i16>
+; CHECK-NEXT:[[TMP14:%.*]] = shufflevector <2 x i16> [[TMP11]], <2 x i16> zeroinitializer, <4 x i32> 
+; CHECK-NEXT:[[TMP15:%.*]] = bitcast <4 x i16> [[TMP14]] to <2 x float>
+; CHECK-NEXT:[[TMP16:%.*]] = shufflevector <2 x i16> 

[PATCH] D96110: [X86] Pass to transform tdpbf16ps intrinsics to scalar operation.

2021-03-18 Thread Bing Yu via Phabricator via cfe-commits
yubing updated this revision to Diff 331766.
yubing added a comment.

address Pengfei's comments


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D96110/new/

https://reviews.llvm.org/D96110

Files:
  llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
  llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll

Index: llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
===
--- llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
+++ llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
@@ -97,8 +97,8 @@
   ret void
 }
 
-define dso_local void @test_amx_dp(i16 signext %row, i16 signext %col, i16 signext %k, <256 x i32> %c, <256 x i32> %a, <256 x i32> %b, <256 x i32>* %vptr) #0 {
-; CHECK-LABEL: @test_amx_dp(
+define dso_local void @test_amx_dpbssd(i16 signext %row, i16 signext %col, i16 signext %k, <256 x i32> %c, <256 x i32> %a, <256 x i32> %b, <256 x i32>* %vptr) #0 {
+; CHECK-LABEL: @test_amx_dpbssd(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:[[A_AMX:%.*]] = bitcast <256 x i32> [[A:%.*]] to x86_amx
 ; CHECK-NEXT:[[B_AMX:%.*]] = bitcast <256 x i32> [[B:%.*]] to x86_amx
@@ -172,6 +172,84 @@
   ret void
 }
 
+define dso_local void @test_amx_dpbf16ps(i16 signext %row, i16 signext %col, i16 signext %k, <256 x i32> %c, <256 x i32> %a, <256 x i32> %b, <256 x i32>* %vptr) #0 {
+; CHECK-LABEL: @test_amx_dpbf16ps(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:[[A_AMX:%.*]] = bitcast <256 x i32> [[A:%.*]] to x86_amx
+; CHECK-NEXT:[[B_AMX:%.*]] = bitcast <256 x i32> [[B:%.*]] to x86_amx
+; CHECK-NEXT:[[C_AMX:%.*]] = bitcast <256 x i32> [[C:%.*]] to x86_amx
+; CHECK-NEXT:[[TMP0:%.*]] = lshr i16 [[COL:%.*]], 2
+; CHECK-NEXT:[[TMP1:%.*]] = lshr i16 [[K:%.*]], 2
+; CHECK-NEXT:br label [[TDPBF16PS_SCALARIZE_ROWS_HEADER:%.*]]
+; CHECK:   tdpbf16ps.scalarize.rows.header:
+; CHECK-NEXT:[[TDPBF16PS_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TDPBF16PS_SCALARIZE_ROWS_STEP:%.*]], [[TDPBF16PS_SCALARIZE_ROWS_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_C_PHI_ROW:%.*]] = phi <256 x i32> [ [[C]], [[ENTRY]] ], [ [[TMP21:%.*]], [[TDPBF16PS_SCALARIZE_ROWS_LATCH]] ]
+; CHECK-NEXT:[[VEC_D_PHI_ROW:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP23:%.*]], [[TDPBF16PS_SCALARIZE_ROWS_LATCH]] ]
+; CHECK-NEXT:br label [[TDPBF16PS_SCALARIZE_ROWS_BODY:%.*]]
+; CHECK:   tdpbf16ps.scalarize.rows.body:
+; CHECK-NEXT:br label [[TDPBF16PS_SCALARIZE_COLS_HEADER:%.*]]
+; CHECK:   tdpbf16ps.scalarize.cols.header:
+; CHECK-NEXT:[[TDPBF16PS_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TDPBF16PS_SCALARIZE_ROWS_BODY]] ], [ [[TDPBF16PS_SCALARIZE_COLS_STEP:%.*]], [[TDPBF16PS_SCALARIZE_COLS_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_C_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_ROW]], [[TDPBF16PS_SCALARIZE_ROWS_BODY]] ], [ [[TMP21]], [[TDPBF16PS_SCALARIZE_COLS_LATCH]] ]
+; CHECK-NEXT:[[VEC_D_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_D_PHI_ROW]], [[TDPBF16PS_SCALARIZE_ROWS_BODY]] ], [ [[TMP23]], [[TDPBF16PS_SCALARIZE_COLS_LATCH]] ]
+; CHECK-NEXT:[[TMP2:%.*]] = mul i16 [[TDPBF16PS_SCALARIZE_ROWS_IV]], 16
+; CHECK-NEXT:[[TMP3:%.*]] = add i16 [[TMP2]], [[TDPBF16PS_SCALARIZE_COLS_IV]]
+; CHECK-NEXT:br label [[TDPBF16PS_SCALARIZE_COLS_BODY:%.*]]
+; CHECK:   tdpbf16ps.scalarize.cols.body:
+; CHECK-NEXT:br label [[TDPBF16PS_SCALARIZE_INNER_HEADER:%.*]]
+; CHECK:   tdpbf16ps.scalarize.inner.header:
+; CHECK-NEXT:[[TDPBF16PS_SCALARIZE_INNER_IV:%.*]] = phi i16 [ 0, [[TDPBF16PS_SCALARIZE_COLS_BODY]] ], [ [[TDPBF16PS_SCALARIZE_INNER_STEP:%.*]], [[TDPBF16PS_SCALARIZE_INNER_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_C_INNER_PHI:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_COL]], [[TDPBF16PS_SCALARIZE_COLS_BODY]] ], [ [[TMP21]], [[TDPBF16PS_SCALARIZE_INNER_LATCH]] ]
+; CHECK-NEXT:br label [[TDPBF16PS_SCALARIZE_INNER_BODY:%.*]]
+; CHECK:   tdpbf16ps.scalarize.inner.body:
+; CHECK-NEXT:[[TMP4:%.*]] = mul i16 [[TDPBF16PS_SCALARIZE_ROWS_IV]], 16
+; CHECK-NEXT:[[TMP5:%.*]] = add i16 [[TMP4]], [[TDPBF16PS_SCALARIZE_INNER_IV]]
+; CHECK-NEXT:[[TMP6:%.*]] = mul i16 [[TDPBF16PS_SCALARIZE_INNER_IV]], 16
+; CHECK-NEXT:[[TMP7:%.*]] = add i16 [[TMP6]], [[TDPBF16PS_SCALARIZE_COLS_IV]]
+; CHECK-NEXT:[[TMP8:%.*]] = extractelement <256 x i32> [[VEC_C_INNER_PHI]], i16 [[TMP3]]
+; CHECK-NEXT:[[TMP9:%.*]] = bitcast i32 [[TMP8]] to float
+; CHECK-NEXT:[[TMP10:%.*]] = extractelement <256 x i32> [[A]], i16 [[TMP5]]
+; CHECK-NEXT:[[TMP11:%.*]] = bitcast i32 [[TMP10]] to <2 x i16>
+; CHECK-NEXT:[[TMP12:%.*]] = extractelement <256 x i32> [[B]], i16 [[TMP7]]
+; CHECK-NEXT:[[TMP13:%.*]] = bitcast i32 [[TMP12]] to <2 x i16>
+; CHECK-NEXT:[[TMP14:%.*]] = shufflevector <2 x i16> [[TMP11]], <2 x i16> zeroinitializer, <4 x i32> 
+; CHECK-NEXT:[[TMP15:%.*]] = bitcast <4 x i16> [[TMP14]] to <2 x float>
+; CHECK-NEXT:[[TMP16:%.*]] = shufflevector <2 x i16> [[TMP13]], <2 x i16> 

[PATCH] D96110: [X86] Pass to transform tdpbf16ps intrinsics to scalar operation.

2021-03-18 Thread Bing Yu via Phabricator via cfe-commits
yubing added inline comments.



Comment at: llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp:318-319
+// calculate idxa, idxb, idxc
+// %eltc = extractelement <256 x i32> %vec.c.inner.phi, i16 %idxc
+// %eltcf32 = bitcast i32 %eltc to float
+// %elta = extractelement <256 x i32> %veca, i16 %idxa

pengfei wrote:
> Can we create vecC with <256 x float>?
In fact, we are trying to find a bitcast whose operand is <256 x i32>, as shown 
in line 229.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D96110/new/

https://reviews.llvm.org/D96110

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D87981: [X86] AMX programming model.

2021-03-18 Thread Bing Yu via Phabricator via cfe-commits
yubing added inline comments.



Comment at: llvm/lib/Target/X86/X86PreTileConfig.cpp:90
+INITIALIZE_PASS_BEGIN(X86PreTileConfig, "tilepreconfig",
+  "Tile Register Configure", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)

It should be Pre Tile configure instead of Tile Register Configure.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D87981/new/

https://reviews.llvm.org/D87981

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D98685: [X86][AMX] Rename amx-bf16 intrinsic according to correct naming convention

2021-03-16 Thread Bing Yu via Phabricator via cfe-commits
This revision was landed with ongoing or failed builds.
This revision was automatically updated to reflect the committed changes.
Closed by commit rG320b72e9cd77: [X86][AMX] Rename amx-bf16 intrinsic according 
to correct naming convention (authored by yubing).

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D98685/new/

https://reviews.llvm.org/D98685

Files:
  clang/lib/Headers/amxintrin.h
  clang/test/CodeGen/X86/amx_api.c


Index: clang/test/CodeGen/X86/amx_api.c
===
--- clang/test/CodeGen/X86/amx_api.c
+++ clang/test/CodeGen/X86/amx_api.c
@@ -81,9 +81,9 @@
   __tile_zero(&c);
 }
 
-void test_tile_tdpbf16ps(__tile1024i a, __tile1024i b, __tile1024i c) {
-  //CHECK-LABEL: @test_tile_tdpbf16ps
+void test_tile_dpbf16ps(__tile1024i a, __tile1024i b, __tile1024i c) {
+  //CHECK-LABEL: @test_tile_dpbf16ps
   //CHECK: call x86_amx @llvm.x86.tdpbf16ps.internal
   //CHECK-NEXT: {{%.*}} = bitcast x86_amx {{%.*}} to <256 x i32>
-  __tile_tdpbf16ps(&a, b, c);
+  __tile_dpbf16ps(&a, b, c);
 }
Index: clang/lib/Headers/amxintrin.h
===
--- clang/lib/Headers/amxintrin.h
+++ clang/lib/Headers/amxintrin.h
@@ -267,8 +267,8 @@
 }
 
 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16
-_tile_tdpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
- _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+_tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
+_tile1024i dst, _tile1024i src1, _tile1024i src2) {
   return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
 }
 
@@ -323,10 +323,10 @@
 }
 
 __DEFAULT_FN_ATTRS_BF16
-static void __tile_tdpbf16ps(__tile1024i *dst, __tile1024i src1,
- __tile1024i src2) {
-  dst->tile = _tile_tdpbf16ps_internal(src1.row, src2.col, src1.col, dst->tile,
-   src1.tile, src2.tile);
+static void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src1,
+__tile1024i src2) {
+  dst->tile = _tile_dpbf16ps_internal(src1.row, src2.col, src1.col, dst->tile,
+  src1.tile, src2.tile);
 }
 
 #undef __DEFAULT_FN_ATTRS_TILE


Index: clang/test/CodeGen/X86/amx_api.c
===
--- clang/test/CodeGen/X86/amx_api.c
+++ clang/test/CodeGen/X86/amx_api.c
@@ -81,9 +81,9 @@
   __tile_zero(&c);
 }
 
-void test_tile_tdpbf16ps(__tile1024i a, __tile1024i b, __tile1024i c) {
-  //CHECK-LABEL: @test_tile_tdpbf16ps
+void test_tile_dpbf16ps(__tile1024i a, __tile1024i b, __tile1024i c) {
+  //CHECK-LABEL: @test_tile_dpbf16ps
   //CHECK: call x86_amx @llvm.x86.tdpbf16ps.internal
   //CHECK-NEXT: {{%.*}} = bitcast x86_amx {{%.*}} to <256 x i32>
-  __tile_tdpbf16ps(&a, b, c);
+  __tile_dpbf16ps(&a, b, c);
 }
Index: clang/lib/Headers/amxintrin.h
===
--- clang/lib/Headers/amxintrin.h
+++ clang/lib/Headers/amxintrin.h
@@ -267,8 +267,8 @@
 }
 
 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16
-_tile_tdpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
- _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+_tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
+_tile1024i dst, _tile1024i src1, _tile1024i src2) {
   return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
 }
 
@@ -323,10 +323,10 @@
 }
 
 __DEFAULT_FN_ATTRS_BF16
-static void __tile_tdpbf16ps(__tile1024i *dst, __tile1024i src1,
- __tile1024i src2) {
-  dst->tile = _tile_tdpbf16ps_internal(src1.row, src2.col, src1.col, dst->tile,
-   src1.tile, src2.tile);
+static void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src1,
+__tile1024i src2) {
+  dst->tile = _tile_dpbf16ps_internal(src1.row, src2.col, src1.col, dst->tile,
+  src1.tile, src2.tile);
 }
 
 #undef __DEFAULT_FN_ATTRS_TILE
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D93594: [X86] Pass to transform amx intrinsics to scalar operation.

2021-03-16 Thread Bing Yu via Phabricator via cfe-commits
yubing added a comment.

In D93594#2628497 , @nikic wrote:

> It looks like this has caused a compile-time regression at `O0`: 
> https://llvm-compile-time-tracker.com/compare.php?from=9341bcbdc93a251b632ffaa51a84452a7a4a5e4e=4f198b0c27b04e830a3069aaf4b39cf203eaae4a=instructions
>
> The cause is probably the computation of DomTree and LoopInfo, even if no AMX 
> intrinsics are present. I think you should be able to easily fix this by not 
> fetching DT/LI from the pass manager, and computing them in the pass instead 
> (only if intrinsics are present).

Thanks, @nikic, I will fix it ASAP. Besides, how could I reproduce the
regression?
Eh, I am asking this question because I want to verify that the regression
can no longer be reproduced once my future bugfix is in place.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D93594/new/

https://reviews.llvm.org/D93594

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D96110: [X86] Pass to transform tdpbf16ps intrinsics to scalar operation.

2021-03-16 Thread Bing Yu via Phabricator via cfe-commits
yubing updated this revision to Diff 330898.
yubing added a comment.

just do a rebase


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D96110/new/

https://reviews.llvm.org/D96110

Files:
  llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
  llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll

Index: llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
===
--- llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
+++ llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
@@ -97,8 +97,8 @@
   ret void
 }
 
-define dso_local void @test_amx_dp(i16 signext %row, i16 signext %col, i16 signext %k, <256 x i32> %c, <256 x i32> %a, <256 x i32> %b, <256 x i32>* %vptr) #0 {
-; CHECK-LABEL: @test_amx_dp(
+define dso_local void @test_amx_dpbssd(i16 signext %row, i16 signext %col, i16 signext %k, <256 x i32> %c, <256 x i32> %a, <256 x i32> %b, <256 x i32>* %vptr) #0 {
+; CHECK-LABEL: @test_amx_dpbssd(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:[[A_AMX:%.*]] = bitcast <256 x i32> [[A:%.*]] to x86_amx
 ; CHECK-NEXT:[[B_AMX:%.*]] = bitcast <256 x i32> [[B:%.*]] to x86_amx
@@ -172,6 +172,84 @@
   ret void
 }
 
+define dso_local void @test_amx_dpbf16ps(i16 signext %row, i16 signext %col, i16 signext %k, <256 x i32> %c, <256 x i32> %a, <256 x i32> %b, <256 x i32>* %vptr) #0 {
+; CHECK-LABEL: @test_amx_dpbf16ps(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:[[A_AMX:%.*]] = bitcast <256 x i32> [[A:%.*]] to x86_amx
+; CHECK-NEXT:[[B_AMX:%.*]] = bitcast <256 x i32> [[B:%.*]] to x86_amx
+; CHECK-NEXT:[[C_AMX:%.*]] = bitcast <256 x i32> [[C:%.*]] to x86_amx
+; CHECK-NEXT:[[TMP0:%.*]] = lshr i16 [[COL:%.*]], 2
+; CHECK-NEXT:[[TMP1:%.*]] = lshr i16 [[K:%.*]], 2
+; CHECK-NEXT:br label [[TDPBF16PS_SCALARIZE_ROWS_HEADER:%.*]]
+; CHECK:   tdpbf16ps.scalarize.rows.header:
+; CHECK-NEXT:[[TDPBF16PS_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TDPBF16PS_SCALARIZE_ROWS_STEP:%.*]], [[TDPBF16PS_SCALARIZE_ROWS_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_C_PHI_ROW:%.*]] = phi <256 x i32> [ [[C]], [[ENTRY]] ], [ [[TMP21:%.*]], [[TDPBF16PS_SCALARIZE_ROWS_LATCH]] ]
+; CHECK-NEXT:[[VEC_D_PHI_ROW:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP23:%.*]], [[TDPBF16PS_SCALARIZE_ROWS_LATCH]] ]
+; CHECK-NEXT:br label [[TDPBF16PS_SCALARIZE_ROWS_BODY:%.*]]
+; CHECK:   tdpbf16ps.scalarize.rows.body:
+; CHECK-NEXT:br label [[TDPBF16PS_SCALARIZE_COLS_HEADER:%.*]]
+; CHECK:   tdpbf16ps.scalarize.cols.header:
+; CHECK-NEXT:[[TDPBF16PS_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TDPBF16PS_SCALARIZE_ROWS_BODY]] ], [ [[TDPBF16PS_SCALARIZE_COLS_STEP:%.*]], [[TDPBF16PS_SCALARIZE_COLS_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_C_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_ROW]], [[TDPBF16PS_SCALARIZE_ROWS_BODY]] ], [ [[TMP21]], [[TDPBF16PS_SCALARIZE_COLS_LATCH]] ]
+; CHECK-NEXT:[[VEC_D_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_D_PHI_ROW]], [[TDPBF16PS_SCALARIZE_ROWS_BODY]] ], [ [[TMP23]], [[TDPBF16PS_SCALARIZE_COLS_LATCH]] ]
+; CHECK-NEXT:[[TMP2:%.*]] = mul i16 [[TDPBF16PS_SCALARIZE_ROWS_IV]], 16
+; CHECK-NEXT:[[TMP3:%.*]] = add i16 [[TMP2]], [[TDPBF16PS_SCALARIZE_COLS_IV]]
+; CHECK-NEXT:br label [[TDPBF16PS_SCALARIZE_COLS_BODY:%.*]]
+; CHECK:   tdpbf16ps.scalarize.cols.body:
+; CHECK-NEXT:br label [[TDPBF16PS_SCALARIZE_INNER_HEADER:%.*]]
+; CHECK:   tdpbf16ps.scalarize.inner.header:
+; CHECK-NEXT:[[TDPBF16PS_SCALARIZE_INNER_IV:%.*]] = phi i16 [ 0, [[TDPBF16PS_SCALARIZE_COLS_BODY]] ], [ [[TDPBF16PS_SCALARIZE_INNER_STEP:%.*]], [[TDPBF16PS_SCALARIZE_INNER_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_C_INNER_PHI:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_COL]], [[TDPBF16PS_SCALARIZE_COLS_BODY]] ], [ [[TMP21]], [[TDPBF16PS_SCALARIZE_INNER_LATCH]] ]
+; CHECK-NEXT:br label [[TDPBF16PS_SCALARIZE_INNER_BODY:%.*]]
+; CHECK:   tdpbf16ps.scalarize.inner.body:
+; CHECK-NEXT:[[TMP4:%.*]] = mul i16 [[TDPBF16PS_SCALARIZE_ROWS_IV]], 16
+; CHECK-NEXT:[[TMP5:%.*]] = add i16 [[TMP4]], [[TDPBF16PS_SCALARIZE_INNER_IV]]
+; CHECK-NEXT:[[TMP6:%.*]] = mul i16 [[TDPBF16PS_SCALARIZE_INNER_IV]], 16
+; CHECK-NEXT:[[TMP7:%.*]] = add i16 [[TMP6]], [[TDPBF16PS_SCALARIZE_COLS_IV]]
+; CHECK-NEXT:[[TMP8:%.*]] = extractelement <256 x i32> [[VEC_C_INNER_PHI]], i16 [[TMP3]]
+; CHECK-NEXT:[[TMP9:%.*]] = bitcast i32 [[TMP8]] to float
+; CHECK-NEXT:[[TMP10:%.*]] = extractelement <256 x i32> [[A]], i16 [[TMP5]]
+; CHECK-NEXT:[[TMP11:%.*]] = bitcast i32 [[TMP10]] to <2 x i16>
+; CHECK-NEXT:[[TMP12:%.*]] = extractelement <256 x i32> [[B]], i16 [[TMP7]]
+; CHECK-NEXT:[[TMP13:%.*]] = bitcast i32 [[TMP12]] to <2 x i16>
+; CHECK-NEXT:[[TMP14:%.*]] = shufflevector <2 x i16> [[TMP11]], <2 x i16> zeroinitializer, <4 x i32> 
+; CHECK-NEXT:[[TMP15:%.*]] = bitcast <4 x i16> [[TMP14]] to <2 x float>
+; CHECK-NEXT:[[TMP16:%.*]] = shufflevector <2 x i16> [[TMP13]], <2 x i16> zeroinitializer, <4 x 

[PATCH] D98685: [X86][AMX] Rename amx-bf16 intrinsic according to correct naming convention __tile_tdpbf16ps should be renamed with __tile_dpbf16ps

2021-03-16 Thread Bing Yu via Phabricator via cfe-commits
yubing created this revision.
Herald added a subscriber: pengfei.
yubing requested review of this revision.
Herald added a project: clang.
Herald added a subscriber: cfe-commits.

Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D98685

Files:
  clang/lib/Headers/amxintrin.h
  clang/test/CodeGen/X86/amx_api.c


Index: clang/test/CodeGen/X86/amx_api.c
===
--- clang/test/CodeGen/X86/amx_api.c
+++ clang/test/CodeGen/X86/amx_api.c
@@ -81,9 +81,9 @@
   __tile_zero(&c);
 }
 
-void test_tile_tdpbf16ps(__tile1024i a, __tile1024i b, __tile1024i c) {
-  //CHECK-LABEL: @test_tile_tdpbf16ps
+void test_tile_dpbf16ps(__tile1024i a, __tile1024i b, __tile1024i c) {
+  //CHECK-LABEL: @test_tile_dpbf16ps
   //CHECK: call x86_amx @llvm.x86.tdpbf16ps.internal
   //CHECK-NEXT: {{%.*}} = bitcast x86_amx {{%.*}} to <256 x i32>
-  __tile_tdpbf16ps(&a, b, c);
+  __tile_dpbf16ps(&a, b, c);
 }
Index: clang/lib/Headers/amxintrin.h
===
--- clang/lib/Headers/amxintrin.h
+++ clang/lib/Headers/amxintrin.h
@@ -267,8 +267,8 @@
 }
 
 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16
-_tile_tdpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
- _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+_tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
+_tile1024i dst, _tile1024i src1, _tile1024i src2) {
   return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
 }
 
@@ -323,10 +323,10 @@
 }
 
 __DEFAULT_FN_ATTRS_BF16
-static void __tile_tdpbf16ps(__tile1024i *dst, __tile1024i src1,
- __tile1024i src2) {
-  dst->tile = _tile_tdpbf16ps_internal(src1.row, src2.col, src1.col, dst->tile,
-   src1.tile, src2.tile);
+static void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src1,
+__tile1024i src2) {
+  dst->tile = _tile_dpbf16ps_internal(src1.row, src2.col, src1.col, dst->tile,
+  src1.tile, src2.tile);
 }
 
 #undef __DEFAULT_FN_ATTRS_TILE


Index: clang/test/CodeGen/X86/amx_api.c
===
--- clang/test/CodeGen/X86/amx_api.c
+++ clang/test/CodeGen/X86/amx_api.c
@@ -81,9 +81,9 @@
   __tile_zero(&c);
 }
 
-void test_tile_tdpbf16ps(__tile1024i a, __tile1024i b, __tile1024i c) {
-  //CHECK-LABEL: @test_tile_tdpbf16ps
+void test_tile_dpbf16ps(__tile1024i a, __tile1024i b, __tile1024i c) {
+  //CHECK-LABEL: @test_tile_dpbf16ps
   //CHECK: call x86_amx @llvm.x86.tdpbf16ps.internal
   //CHECK-NEXT: {{%.*}} = bitcast x86_amx {{%.*}} to <256 x i32>
-  __tile_tdpbf16ps(&a, b, c);
+  __tile_dpbf16ps(&a, b, c);
 }
Index: clang/lib/Headers/amxintrin.h
===
--- clang/lib/Headers/amxintrin.h
+++ clang/lib/Headers/amxintrin.h
@@ -267,8 +267,8 @@
 }
 
 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16
-_tile_tdpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
- _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+_tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
+_tile1024i dst, _tile1024i src1, _tile1024i src2) {
   return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
 }
 
@@ -323,10 +323,10 @@
 }
 
 __DEFAULT_FN_ATTRS_BF16
-static void __tile_tdpbf16ps(__tile1024i *dst, __tile1024i src1,
- __tile1024i src2) {
-  dst->tile = _tile_tdpbf16ps_internal(src1.row, src2.col, src1.col, dst->tile,
-   src1.tile, src2.tile);
+static void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src1,
+__tile1024i src2) {
+  dst->tile = _tile_dpbf16ps_internal(src1.row, src2.col, src1.col, dst->tile,
+  src1.tile, src2.tile);
 }
 
 #undef __DEFAULT_FN_ATTRS_TILE
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D97358: [X86] Support amx-bf16 intrinsic.

2021-03-16 Thread Bing Yu via Phabricator via cfe-commits
yubing added inline comments.



Comment at: clang/lib/Headers/amxintrin.h:326
+__DEFAULT_FN_ATTRS_BF16
+static void __tile_tdpbf16ps(__tile1024i *dst, __tile1024i src1,
+ __tile1024i src2) {

Should we align this with "tile_dpbssd" by renaming it wth "tile_dpbf16ps"?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D97358/new/

https://reviews.llvm.org/D97358

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D93594: [X86] Pass to transform amx intrinsics to scalar operation.

2021-03-15 Thread Bing Yu via Phabricator via cfe-commits
This revision was landed with ongoing or failed builds.
This revision was automatically updated to reflect the committed changes.
Closed by commit rG4f198b0c27b0: [X86] Pass to transform amx intrinsics to 
scalar operation. (authored by yubing).

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D93594/new/

https://reviews.llvm.org/D93594

Files:
  llvm/include/llvm/CodeGen/Passes.h
  llvm/lib/Target/X86/CMakeLists.txt
  llvm/lib/Target/X86/X86.h
  llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
  llvm/lib/Target/X86/X86LowerAMXType.cpp
  llvm/lib/Target/X86/X86TargetMachine.cpp
  llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll
  llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
  llvm/test/CodeGen/X86/AMX/amx-type.ll
  llvm/test/CodeGen/X86/O0-pipeline.ll
  llvm/test/CodeGen/X86/opt-pipeline.ll
  llvm/tools/opt/opt.cpp

Index: llvm/tools/opt/opt.cpp
===
--- llvm/tools/opt/opt.cpp
+++ llvm/tools/opt/opt.cpp
@@ -520,7 +520,8 @@
   "expand-reductions","indirectbr-expand",
   "generic-to-nvvm",  "expandmemcmp",
   "loop-reduce",  "lower-amx-type",
-  "polyhedral-info",  "replace-with-veclib"};
+  "lower-amx-intrinsics", "polyhedral-info",
+  "replace-with-veclib"};
   for (const auto  : PassNamePrefix)
 if (Pass.startswith(P))
   return true;
Index: llvm/test/CodeGen/X86/opt-pipeline.ll
===
--- llvm/test/CodeGen/X86/opt-pipeline.ll
+++ llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -24,11 +24,12 @@
 ; CHECK-NEXT: Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT: FunctionPass Manager
 ; CHECK-NEXT:   Expand Atomic instructions
+; CHECK-NEXT:   Dominator Tree Construction
+; CHECK-NEXT:   Natural Loop Information
+; CHECK-NEXT:   Lower AMX intrinsics
 ; CHECK-NEXT:   Lower AMX type for load/store
 ; CHECK-NEXT:   Module Verifier
-; CHECK-NEXT:   Dominator Tree Construction
 ; CHECK-NEXT:   Basic Alias Analysis (stateless AA impl)
-; CHECK-NEXT:   Natural Loop Information
 ; CHECK-NEXT:   Canonicalize natural loops
 ; CHECK-NEXT:   Scalar Evolution Analysis
 ; CHECK-NEXT:   Loop Pass Manager
Index: llvm/test/CodeGen/X86/O0-pipeline.ll
===
--- llvm/test/CodeGen/X86/O0-pipeline.ll
+++ llvm/test/CodeGen/X86/O0-pipeline.ll
@@ -18,6 +18,9 @@
 ; CHECK-NEXT: Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT: FunctionPass Manager
 ; CHECK-NEXT:   Expand Atomic instructions
+; CHECK-NEXT:   Dominator Tree Construction
+; CHECK-NEXT:   Natural Loop Information
+; CHECK-NEXT:   Lower AMX intrinsics
 ; CHECK-NEXT:   Lower AMX type for load/store
 ; CHECK-NEXT:   Module Verifier
 ; CHECK-NEXT:   Lower Garbage Collection Instructions
Index: llvm/test/CodeGen/X86/AMX/amx-type.ll
===
--- llvm/test/CodeGen/X86/AMX/amx-type.ll
+++ llvm/test/CodeGen/X86/AMX/amx-type.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -lower-amx-type %s -S | FileCheck %s
+; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s
 
 %struct.__tile_str = type { i16, i16, <256 x i32> }
 
Index: llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
===
--- /dev/null
+++ llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
@@ -0,0 +1,237 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics %s -S | FileCheck %s
+
+define dso_local void @test_amx_load_non_O0(i16 signext %row, i16 signext %col, i8 *%ptr, i64 %stride, <256 x i32>* %vptr) {
+; CHECK-LABEL: @test_amx_load_non_O0(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:[[TMP0:%.*]] = lshr i16 [[COL:%.*]], 2
+; CHECK-NEXT:[[TMP1:%.*]] = lshr i64 [[STRIDE:%.*]], 2
+; CHECK-NEXT:br label [[TILELOAD_SCALARIZE_ROWS_HEADER:%.*]]
+; CHECK:   tileload.scalarize.rows.header:
+; CHECK-NEXT:[[TILELOAD_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TILELOAD_SCALARIZE_ROWS_STEP:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_PHI_ROW:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP11:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH]] ]
+; CHECK-NEXT:br label [[TILELOAD_SCALARIZE_ROWS_BODY:%.*]]
+; CHECK:   tileload.scalarize.rows.body:
+; CHECK-NEXT:br label [[TILELOAD_SCALARIZE_COLS_HEADER:%.*]]
+; CHECK:   tileload.scalarize.cols.header:
+; CHECK-NEXT:[[TILELOAD_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TILELOAD_SCALARIZE_ROWS_BODY]] ], [ [[TILELOAD_SCALARIZE_COLS_STEP:%.*]], [[TILELOAD_SCALARIZE_COLS_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_PHI:%.*]] = phi <256 x i32> [ [[VEC_PHI_ROW]], 

[PATCH] D93594: [X86] Pass to transform amx intrinsics to scalar operation.

2021-03-08 Thread Bing Yu via Phabricator via cfe-commits
yubing updated this revision to Diff 329204.
yubing added a comment.

Fix buildfail when it is -DBUILD_SHARED_LIBS=ON


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D93594/new/

https://reviews.llvm.org/D93594

Files:
  llvm/include/llvm/CodeGen/Passes.h
  llvm/lib/Target/X86/CMakeLists.txt
  llvm/lib/Target/X86/X86.h
  llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
  llvm/lib/Target/X86/X86LowerAMXType.cpp
  llvm/lib/Target/X86/X86TargetMachine.cpp
  llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll
  llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
  llvm/test/CodeGen/X86/AMX/amx-type.ll
  llvm/test/CodeGen/X86/O0-pipeline.ll
  llvm/test/CodeGen/X86/opt-pipeline.ll
  llvm/tools/opt/opt.cpp

Index: llvm/tools/opt/opt.cpp
===
--- llvm/tools/opt/opt.cpp
+++ llvm/tools/opt/opt.cpp
@@ -513,7 +513,8 @@
   "expand-reductions","indirectbr-expand",
   "generic-to-nvvm",  "expandmemcmp",
   "loop-reduce",  "lower-amx-type",
-  "polyhedral-info",  "replace-with-veclib"};
+  "lower-amx-intrinsics", "polyhedral-info",
+  "replace-with-veclib"};
   for (const auto  : PassNamePrefix)
 if (Pass.startswith(P))
   return true;
Index: llvm/test/CodeGen/X86/opt-pipeline.ll
===
--- llvm/test/CodeGen/X86/opt-pipeline.ll
+++ llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -24,11 +24,12 @@
 ; CHECK-NEXT: Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT: FunctionPass Manager
 ; CHECK-NEXT:   Expand Atomic instructions
+; CHECK-NEXT:   Dominator Tree Construction
+; CHECK-NEXT:   Natural Loop Information
+; CHECK-NEXT:   Lower AMX intrinsics
 ; CHECK-NEXT:   Lower AMX type for load/store
 ; CHECK-NEXT:   Module Verifier
-; CHECK-NEXT:   Dominator Tree Construction
 ; CHECK-NEXT:   Basic Alias Analysis (stateless AA impl)
-; CHECK-NEXT:   Natural Loop Information
 ; CHECK-NEXT:   Canonicalize natural loops
 ; CHECK-NEXT:   Scalar Evolution Analysis
 ; CHECK-NEXT:   Loop Pass Manager
Index: llvm/test/CodeGen/X86/O0-pipeline.ll
===
--- llvm/test/CodeGen/X86/O0-pipeline.ll
+++ llvm/test/CodeGen/X86/O0-pipeline.ll
@@ -18,6 +18,9 @@
 ; CHECK-NEXT: Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT: FunctionPass Manager
 ; CHECK-NEXT:   Expand Atomic instructions
+; CHECK-NEXT:   Dominator Tree Construction
+; CHECK-NEXT:   Natural Loop Information
+; CHECK-NEXT:   Lower AMX intrinsics
 ; CHECK-NEXT:   Lower AMX type for load/store
 ; CHECK-NEXT:   Module Verifier
 ; CHECK-NEXT:   Lower Garbage Collection Instructions
Index: llvm/test/CodeGen/X86/AMX/amx-type.ll
===
--- llvm/test/CodeGen/X86/AMX/amx-type.ll
+++ llvm/test/CodeGen/X86/AMX/amx-type.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -lower-amx-type %s -S | FileCheck %s
+; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s
 
 %struct.__tile_str = type { i16, i16, <256 x i32> }
 
Index: llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
===
--- /dev/null
+++ llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
@@ -0,0 +1,237 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics %s -S | FileCheck %s
+
+define dso_local void @test_amx_load_non_O0(i16 signext %row, i16 signext %col, i8 *%ptr, i64 %stride, <256 x i32>* %vptr) {
+; CHECK-LABEL: @test_amx_load_non_O0(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:[[TMP0:%.*]] = lshr i16 [[COL:%.*]], 2
+; CHECK-NEXT:[[TMP1:%.*]] = lshr i64 [[STRIDE:%.*]], 2
+; CHECK-NEXT:br label [[TILELOAD_SCALARIZE_ROWS_HEADER:%.*]]
+; CHECK:   tileload.scalarize.rows.header:
+; CHECK-NEXT:[[TILELOAD_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TILELOAD_SCALARIZE_ROWS_STEP:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_PHI_ROW:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP11:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH]] ]
+; CHECK-NEXT:br label [[TILELOAD_SCALARIZE_ROWS_BODY:%.*]]
+; CHECK:   tileload.scalarize.rows.body:
+; CHECK-NEXT:br label [[TILELOAD_SCALARIZE_COLS_HEADER:%.*]]
+; CHECK:   tileload.scalarize.cols.header:
+; CHECK-NEXT:[[TILELOAD_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TILELOAD_SCALARIZE_ROWS_BODY]] ], [ [[TILELOAD_SCALARIZE_COLS_STEP:%.*]], [[TILELOAD_SCALARIZE_COLS_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_PHI:%.*]] = phi <256 x i32> [ [[VEC_PHI_ROW]], [[TILELOAD_SCALARIZE_ROWS_BODY]] ], [ [[TMP11]], [[TILELOAD_SCALARIZE_COLS_LATCH]] ]
+; CHECK-NEXT:br label [[TILELOAD_SCALARIZE_COLS_BODY:%.*]]
+; 

[PATCH] D93594: [X86] Pass to transform amx intrinsics to scalar operation.

2021-03-08 Thread Bing Yu via Phabricator via cfe-commits
yubing added a comment.

In D93594#2606157 , @RKSimon wrote:

> @yubing I've reverted this as it was failing on a lot of buildbots: 
> http://lab.llvm.org:8011/#/builders/109/builds/9867

Hi, @RKSimon @nicolasvasilache, it seems we haven't told
libLLVMX86CodeGen.so.13git to link TransformUtils in
llvm/lib/Target/X86/CMakeLists.txt. That's why we encounter the build failure.
But there is a strange thing that can be observed in build.ninja:
When I run cmake with "-DBUILD_SHARED_LIBS=OFF", libLLVMX86CodeGen.a will still
link lib/libLLVMTransformUtils.a.
When I run cmake with "-DBUILD_SHARED_LIBS=ON", libLLVMX86CodeGen.so.13git won't
link TransformUtils.
Is there any difference in the build system between static and shared libraries?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D93594/new/

https://reviews.llvm.org/D93594

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D93594: [X86] Pass to transform amx intrinsics to scalar operation.

2021-03-06 Thread Bing Yu via Phabricator via cfe-commits
yubing added a comment.

Thanks all for reporting and reverting this. I will do bugfix asap.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D93594/new/

https://reviews.llvm.org/D93594

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D93594: [X86] Pass to transform amx intrinsics to scalar operation.

2021-03-05 Thread Bing Yu via Phabricator via cfe-commits
This revision was landed with ongoing or failed builds.
This revision was automatically updated to reflect the committed changes.
Closed by commit rG8198d83965ba: [X86] Pass to transform amx intrinsics to 
scalar operation. (authored by LuoYuanke, committed by yubing).

Changed prior to commit:
  https://reviews.llvm.org/D93594?vs=328408&id=328412#toc

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D93594/new/

https://reviews.llvm.org/D93594

Files:
  llvm/include/llvm/CodeGen/Passes.h
  llvm/lib/Target/X86/CMakeLists.txt
  llvm/lib/Target/X86/X86.h
  llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
  llvm/lib/Target/X86/X86LowerAMXType.cpp
  llvm/lib/Target/X86/X86TargetMachine.cpp
  llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll
  llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
  llvm/test/CodeGen/X86/AMX/amx-type.ll
  llvm/test/CodeGen/X86/O0-pipeline.ll
  llvm/test/CodeGen/X86/opt-pipeline.ll
  llvm/tools/opt/opt.cpp

Index: llvm/tools/opt/opt.cpp
===
--- llvm/tools/opt/opt.cpp
+++ llvm/tools/opt/opt.cpp
@@ -513,7 +513,8 @@
   "expand-reductions","indirectbr-expand",
   "generic-to-nvvm",  "expandmemcmp",
   "loop-reduce",  "lower-amx-type",
-  "polyhedral-info",  "replace-with-veclib"};
+  "lower-amx-intrinsics", "polyhedral-info",
+  "replace-with-veclib"};
   for (const auto  : PassNamePrefix)
 if (Pass.startswith(P))
   return true;
Index: llvm/test/CodeGen/X86/opt-pipeline.ll
===
--- llvm/test/CodeGen/X86/opt-pipeline.ll
+++ llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -24,11 +24,12 @@
 ; CHECK-NEXT: Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT: FunctionPass Manager
 ; CHECK-NEXT:   Expand Atomic instructions
+; CHECK-NEXT:   Dominator Tree Construction
+; CHECK-NEXT:   Natural Loop Information
+; CHECK-NEXT:   Lower AMX intrinsics
 ; CHECK-NEXT:   Lower AMX type for load/store
 ; CHECK-NEXT:   Module Verifier
-; CHECK-NEXT:   Dominator Tree Construction
 ; CHECK-NEXT:   Basic Alias Analysis (stateless AA impl)
-; CHECK-NEXT:   Natural Loop Information
 ; CHECK-NEXT:   Canonicalize natural loops
 ; CHECK-NEXT:   Scalar Evolution Analysis
 ; CHECK-NEXT:   Loop Pass Manager
Index: llvm/test/CodeGen/X86/O0-pipeline.ll
===
--- llvm/test/CodeGen/X86/O0-pipeline.ll
+++ llvm/test/CodeGen/X86/O0-pipeline.ll
@@ -18,6 +18,9 @@
 ; CHECK-NEXT: Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT: FunctionPass Manager
 ; CHECK-NEXT:   Expand Atomic instructions
+; CHECK-NEXT:   Dominator Tree Construction
+; CHECK-NEXT:   Natural Loop Information
+; CHECK-NEXT:   Lower AMX intrinsics
 ; CHECK-NEXT:   Lower AMX type for load/store
 ; CHECK-NEXT:   Module Verifier
 ; CHECK-NEXT:   Lower Garbage Collection Instructions
Index: llvm/test/CodeGen/X86/AMX/amx-type.ll
===
--- llvm/test/CodeGen/X86/AMX/amx-type.ll
+++ llvm/test/CodeGen/X86/AMX/amx-type.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -lower-amx-type %s -S | FileCheck %s
+; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s
 
 %struct.__tile_str = type { i16, i16, <256 x i32> }
 
Index: llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
===
--- /dev/null
+++ llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
@@ -0,0 +1,237 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics %s -S | FileCheck %s
+
+define dso_local void @test_amx_load_non_O0(i16 signext %row, i16 signext %col, i8 *%ptr, i64 %stride, <256 x i32>* %vptr) {
+; CHECK-LABEL: @test_amx_load_non_O0(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:[[TMP0:%.*]] = lshr i16 [[COL:%.*]], 2
+; CHECK-NEXT:[[TMP1:%.*]] = lshr i64 [[STRIDE:%.*]], 2
+; CHECK-NEXT:br label [[TILELOAD_SCALARIZE_ROWS_HEADER:%.*]]
+; CHECK:   tileload.scalarize.rows.header:
+; CHECK-NEXT:[[TILELOAD_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TILELOAD_SCALARIZE_ROWS_STEP:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_PHI_ROW:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP11:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH]] ]
+; CHECK-NEXT:br label [[TILELOAD_SCALARIZE_ROWS_BODY:%.*]]
+; CHECK:   tileload.scalarize.rows.body:
+; CHECK-NEXT:br label [[TILELOAD_SCALARIZE_COLS_HEADER:%.*]]
+; CHECK:   tileload.scalarize.cols.header:
+; CHECK-NEXT:[[TILELOAD_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TILELOAD_SCALARIZE_ROWS_BODY]] ], [ [[TILELOAD_SCALARIZE_COLS_STEP:%.*]], 

[PATCH] D93594: [X86] Pass to transform amx intrinsics to scalar operation.

2021-03-04 Thread Bing Yu via Phabricator via cfe-commits
yubing updated this revision to Diff 328408.
yubing added a comment.

Address pengfei's comments


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D93594/new/

https://reviews.llvm.org/D93594

Files:
  llvm/include/llvm/CodeGen/Passes.h
  llvm/lib/Target/X86/CMakeLists.txt
  llvm/lib/Target/X86/X86.h
  llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
  llvm/lib/Target/X86/X86LowerAMXType.cpp
  llvm/lib/Target/X86/X86TargetMachine.cpp
  llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll
  llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
  llvm/test/CodeGen/X86/AMX/amx-type.ll
  llvm/test/CodeGen/X86/O0-pipeline.ll
  llvm/test/CodeGen/X86/opt-pipeline.ll
  llvm/tools/opt/opt.cpp

Index: llvm/tools/opt/opt.cpp
===
--- llvm/tools/opt/opt.cpp
+++ llvm/tools/opt/opt.cpp
@@ -513,7 +513,8 @@
   "expand-reductions","indirectbr-expand",
   "generic-to-nvvm",  "expandmemcmp",
   "loop-reduce",  "lower-amx-type",
-  "polyhedral-info",  "replace-with-veclib"};
+  "lower-amx-intrinsics" ,"polyhedral-info",
+  "replace-with-veclib"};
   for (const auto  : PassNamePrefix)
 if (Pass.startswith(P))
   return true;
Index: llvm/test/CodeGen/X86/opt-pipeline.ll
===
--- llvm/test/CodeGen/X86/opt-pipeline.ll
+++ llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -24,11 +24,12 @@
 ; CHECK-NEXT: Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT: FunctionPass Manager
 ; CHECK-NEXT:   Expand Atomic instructions
+; CHECK-NEXT:   Dominator Tree Construction
+; CHECK-NEXT:   Natural Loop Information
+; CHECK-NEXT:   Lower AMX intrinsics
 ; CHECK-NEXT:   Lower AMX type for load/store
 ; CHECK-NEXT:   Module Verifier
-; CHECK-NEXT:   Dominator Tree Construction
 ; CHECK-NEXT:   Basic Alias Analysis (stateless AA impl)
-; CHECK-NEXT:   Natural Loop Information
 ; CHECK-NEXT:   Canonicalize natural loops
 ; CHECK-NEXT:   Scalar Evolution Analysis
 ; CHECK-NEXT:   Loop Pass Manager
Index: llvm/test/CodeGen/X86/O0-pipeline.ll
===
--- llvm/test/CodeGen/X86/O0-pipeline.ll
+++ llvm/test/CodeGen/X86/O0-pipeline.ll
@@ -18,6 +18,9 @@
 ; CHECK-NEXT: Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT: FunctionPass Manager
 ; CHECK-NEXT:   Expand Atomic instructions
+; CHECK-NEXT:   Dominator Tree Construction
+; CHECK-NEXT:   Natural Loop Information
+; CHECK-NEXT:   Lower AMX intrinsics
 ; CHECK-NEXT:   Lower AMX type for load/store
 ; CHECK-NEXT:   Module Verifier
 ; CHECK-NEXT:   Lower Garbage Collection Instructions
Index: llvm/test/CodeGen/X86/AMX/amx-type.ll
===
--- llvm/test/CodeGen/X86/AMX/amx-type.ll
+++ llvm/test/CodeGen/X86/AMX/amx-type.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -lower-amx-type %s -S | FileCheck %s
+; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s
 
 %struct.__tile_str = type { i16, i16, <256 x i32> }
 
Index: llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
===
--- /dev/null
+++ llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
@@ -0,0 +1,237 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics %s -S | FileCheck %s
+
+define dso_local void @test_amx_load_non_O0(i16 signext %row, i16 signext %col, i8 *%ptr, i64 %stride, <256 x i32>* %vptr) {
+; CHECK-LABEL: @test_amx_load_non_O0(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:[[TMP0:%.*]] = lshr i16 [[COL:%.*]], 2
+; CHECK-NEXT:[[TMP1:%.*]] = lshr i64 [[STRIDE:%.*]], 2
+; CHECK-NEXT:br label [[TILELOAD_SCALARIZE_ROWS_HEADER:%.*]]
+; CHECK:   tileload.scalarize.rows.header:
+; CHECK-NEXT:[[TILELOAD_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TILELOAD_SCALARIZE_ROWS_STEP:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_PHI_ROW:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP11:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH]] ]
+; CHECK-NEXT:br label [[TILELOAD_SCALARIZE_ROWS_BODY:%.*]]
+; CHECK:   tileload.scalarize.rows.body:
+; CHECK-NEXT:br label [[TILELOAD_SCALARIZE_COLS_HEADER:%.*]]
+; CHECK:   tileload.scalarize.cols.header:
+; CHECK-NEXT:[[TILELOAD_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TILELOAD_SCALARIZE_ROWS_BODY]] ], [ [[TILELOAD_SCALARIZE_COLS_STEP:%.*]], [[TILELOAD_SCALARIZE_COLS_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_PHI:%.*]] = phi <256 x i32> [ [[VEC_PHI_ROW]], [[TILELOAD_SCALARIZE_ROWS_BODY]] ], [ [[TMP11]], [[TILELOAD_SCALARIZE_COLS_LATCH]] ]
+; CHECK-NEXT:br label [[TILELOAD_SCALARIZE_COLS_BODY:%.*]]
+; CHECK:   

[PATCH] D93594: [X86] Pass to transform amx intrinsics to scalar operation.

2021-03-02 Thread Bing Yu via Phabricator via cfe-commits
yubing added inline comments.



Comment at: llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp:311
+  Value *ResElt = B.CreateAdd(EltC, SubVecR);
+  Value *NewVecC = B.CreateInsertElement(VecCPhi, ResElt, IdxC);
+  Value *NewVecD = B.CreateInsertElement(VecDPhi, ResElt, IdxC);

LuoYuanke wrote:
> yubing wrote:
> > pengfei wrote:
> > > yubing wrote:
> > > > pengfei wrote:
> > > > > Is it necessary to insert the ResElt to VecC?
> > > > Yes, it is necessary since you should use updated eltC(aka, Cij) when 
> > > > you are doing matrix dotproduct:
> > > > Cij =Cij+Ai1.*B1j
> > > > Cij =Cij+Ai2.*B2j
> > > > 
> > > > Cij =Cij+AiK.*BKj
> > > But you don't need to update both C and D. Something like the pseudo code 
> > > should be enough:
> > > ```
> > > for (k : K)
> > >   Dij += Aik * Bkj;
> > > Dij += Cij
> > > ```
> > I change code into the following style, and it can also reduce inner loop's 
> > size:
> > ```
> > for (k : K)
> >   Cij += Aik * Bkj;
> > Dij = Cij
> > ```
> > Besides, I hoist the procedure of calculating (i,j)'s linear index above 
> > inner loops.
> It seems keeping vector C unchanged is simpler. We can eliminate the phi, 
> extract and insert instruction for vector C.
But your solution  still need to update D so D's phi will be kept in the inner 
loops.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D93594/new/

https://reviews.llvm.org/D93594

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D93594: [X86] Pass to transform amx intrinsics to scalar operation.

2021-03-01 Thread Bing Yu via Phabricator via cfe-commits
yubing marked 15 inline comments as done.
yubing added inline comments.



Comment at: llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp:311
+  Value *ResElt = B.CreateAdd(EltC, SubVecR);
+  Value *NewVecC = B.CreateInsertElement(VecCPhi, ResElt, IdxC);
+  Value *NewVecD = B.CreateInsertElement(VecDPhi, ResElt, IdxC);

pengfei wrote:
> yubing wrote:
> > pengfei wrote:
> > > Is it necessary to insert the ResElt to VecC?
> > Yes, it is necessary since you should use updated eltC(aka, Cij) when you 
> > are doing matrix dotproduct:
> > Cij =Cij+Ai1.*B1j
> > Cij =Cij+Ai2.*B2j
> > 
> > Cij =Cij+AiK.*BKj
> But you don't need to update both C and D. Something like the pseudo code 
> should be enough:
> ```
> for (k : K)
>   Dij += Aik * Bkj;
> Dij += Cij
> ```
I change code into the following style, and it can also reduce inner loop's 
size:
```
for (k : K)
  Cij += Aik * Bkj;
Dij = Cij
```
Besides, I hoist the procedure of calculating (i,j)'s linear index above inner 
loops.



Comment at: llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-bitcast.ll:13
+; CHECK-NEXT:br label [[TILELOAD_SCALARIZE_ROWS_BODY:%.*]]
+; CHECK:   tileload.scalarize.rows.body:
+; CHECK-NEXT:br label [[TILELOAD_SCALARIZE_COLS_HEADER:%.*]]

pengfei wrote:
> It seems the body block is not necessary
In fact, the ISel pass can merge basic blocks together.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D93594/new/

https://reviews.llvm.org/D93594

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D93594: [X86] Pass to transform amx intrinsics to scalar operation.

2021-03-01 Thread Bing Yu via Phabricator via cfe-commits
yubing updated this revision to Diff 327362.
yubing edited the summary of this revision.
yubing added a comment.

address comments above


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D93594/new/

https://reviews.llvm.org/D93594

Files:
  llvm/include/llvm/CodeGen/Passes.h
  llvm/lib/Target/X86/CMakeLists.txt
  llvm/lib/Target/X86/X86.h
  llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
  llvm/lib/Target/X86/X86LowerAMXType.cpp
  llvm/lib/Target/X86/X86TargetMachine.cpp
  llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll
  llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
  llvm/test/CodeGen/X86/AMX/amx-type.ll
  llvm/test/CodeGen/X86/O0-pipeline.ll
  llvm/test/CodeGen/X86/opt-pipeline.ll
  llvm/tools/opt/opt.cpp

Index: llvm/tools/opt/opt.cpp
===
--- llvm/tools/opt/opt.cpp
+++ llvm/tools/opt/opt.cpp
@@ -497,7 +497,7 @@
   "expand-reductions","indirectbr-expand",
   "generic-to-nvvm",  "expandmemcmp",
   "loop-reduce",  "lower-amx-type",
-  "polyhedral-info"};
+  "lower-amx-intrinsics", "polyhedral-info"};
   for (const auto  : PassNamePrefix)
 if (Pass.startswith(P))
   return true;
Index: llvm/test/CodeGen/X86/opt-pipeline.ll
===
--- llvm/test/CodeGen/X86/opt-pipeline.ll
+++ llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -24,11 +24,12 @@
 ; CHECK-NEXT: Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT: FunctionPass Manager
 ; CHECK-NEXT:   Expand Atomic instructions
+; CHECK-NEXT:   Dominator Tree Construction
+; CHECK-NEXT:   Natural Loop Information
+; CHECK-NEXT:   Lower AMX intrinsics
 ; CHECK-NEXT:   Lower AMX type for load/store
 ; CHECK-NEXT:   Module Verifier
-; CHECK-NEXT:   Dominator Tree Construction
 ; CHECK-NEXT:   Basic Alias Analysis (stateless AA impl)
-; CHECK-NEXT:   Natural Loop Information
 ; CHECK-NEXT:   Canonicalize natural loops
 ; CHECK-NEXT:   Scalar Evolution Analysis
 ; CHECK-NEXT:   Loop Pass Manager
Index: llvm/test/CodeGen/X86/O0-pipeline.ll
===
--- llvm/test/CodeGen/X86/O0-pipeline.ll
+++ llvm/test/CodeGen/X86/O0-pipeline.ll
@@ -18,6 +18,9 @@
 ; CHECK-NEXT: Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT: FunctionPass Manager
 ; CHECK-NEXT:   Expand Atomic instructions
+; CHECK-NEXT:   Dominator Tree Construction
+; CHECK-NEXT:   Natural Loop Information
+; CHECK-NEXT:   Lower AMX intrinsics
 ; CHECK-NEXT:   Lower AMX type for load/store
 ; CHECK-NEXT:   Module Verifier
 ; CHECK-NEXT:   Lower Garbage Collection Instructions
Index: llvm/test/CodeGen/X86/AMX/amx-type.ll
===
--- llvm/test/CodeGen/X86/AMX/amx-type.ll
+++ llvm/test/CodeGen/X86/AMX/amx-type.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -lower-amx-type %s -S | FileCheck %s
+; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s
 
 %struct.__tile_str = type { i16, i16, <256 x i32> }
 
Index: llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
===
--- /dev/null
+++ llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
@@ -0,0 +1,237 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics %s -S | FileCheck %s
+
+define dso_local void @test_amx_load_non_O0(i16 signext %row, i16 signext %col, i8 *%ptr, i64 %stride, <256 x i32>* %vptr) {
+; CHECK-LABEL: @test_amx_load_non_O0(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:[[TMP0:%.*]] = lshr i16 [[COL:%.*]], 2
+; CHECK-NEXT:[[TMP1:%.*]] = lshr i64 [[STRIDE:%.*]], 2
+; CHECK-NEXT:br label [[TILELOAD_SCALARIZE_ROWS_HEADER:%.*]]
+; CHECK:   tileload.scalarize.rows.header:
+; CHECK-NEXT:[[TILELOAD_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TILELOAD_SCALARIZE_ROWS_STEP:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_PHI_ROW:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP11:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH]] ]
+; CHECK-NEXT:br label [[TILELOAD_SCALARIZE_ROWS_BODY:%.*]]
+; CHECK:   tileload.scalarize.rows.body:
+; CHECK-NEXT:br label [[TILELOAD_SCALARIZE_COLS_HEADER:%.*]]
+; CHECK:   tileload.scalarize.cols.header:
+; CHECK-NEXT:[[TILELOAD_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TILELOAD_SCALARIZE_ROWS_BODY]] ], [ [[TILELOAD_SCALARIZE_COLS_STEP:%.*]], [[TILELOAD_SCALARIZE_COLS_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_PHI:%.*]] = phi <256 x i32> [ [[VEC_PHI_ROW]], [[TILELOAD_SCALARIZE_ROWS_BODY]] ], [ [[TMP11]], [[TILELOAD_SCALARIZE_COLS_LATCH]] ]
+; CHECK-NEXT:br label [[TILELOAD_SCALARIZE_COLS_BODY:%.*]]
+; CHECK:   tileload.scalarize.cols.body:

[PATCH] D93594: [X86] Pass to transform amx intrinsics to scalar operation.

2021-02-28 Thread Bing Yu via Phabricator via cfe-commits
yubing added inline comments.



Comment at: llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp:99
+  Loop *RowLoop = LI.AllocateLoop();
+  Loop *ColLoop = LI.AllocateLoop();
+  RowLoop->addChildLoop(ColLoop);

pengfei wrote:
> Not sure how about the arithmetic intrinsics. But at least for load and store 
> intrinsics we can use LLVM intrinsic `llvm.masked.load/store` to reduce the 
> inner loop.
I think we can compose a follow-up patch for this optimization.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D93594/new/

https://reviews.llvm.org/D93594

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D93594: [X86] Pass to transform amx intrinsics to scalar operation.

2021-02-24 Thread Bing Yu via Phabricator via cfe-commits
yubing added inline comments.



Comment at: llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp:311
+  Value *ResElt = B.CreateAdd(EltC, SubVecR);
+  Value *NewVecC = B.CreateInsertElement(VecCPhi, ResElt, IdxC);
+  Value *NewVecD = B.CreateInsertElement(VecDPhi, ResElt, IdxC);

pengfei wrote:
> Is it necessary to insert the ResElt to VecC?
Yes, it is necessary since you should use the updated eltC (aka Cij) when you are 
doing a matrix dot product:
Cij =Cij+Ai1.*B1j
Cij =Cij+Ai2.*B2j

Cij =Cij+AiK.*BKj


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D93594/new/

https://reviews.llvm.org/D93594

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D93594: [X86] Pass to transform amx intrinsics to scalar operation.

2021-02-23 Thread Bing Yu via Phabricator via cfe-commits
yubing updated this revision to Diff 325964.
yubing added a comment.

Fix some comments and commit message


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D93594/new/

https://reviews.llvm.org/D93594

Files:
  llvm/include/llvm/CodeGen/Passes.h
  llvm/lib/Target/X86/CMakeLists.txt
  llvm/lib/Target/X86/X86.h
  llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
  llvm/lib/Target/X86/X86LowerAMXType.cpp
  llvm/lib/Target/X86/X86TargetMachine.cpp
  llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-bitcast.ll
  llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
  llvm/test/CodeGen/X86/AMX/amx-type.ll
  llvm/test/CodeGen/X86/O0-pipeline.ll
  llvm/test/CodeGen/X86/opt-pipeline.ll
  llvm/tools/opt/opt.cpp

Index: llvm/tools/opt/opt.cpp
===
--- llvm/tools/opt/opt.cpp
+++ llvm/tools/opt/opt.cpp
@@ -497,7 +497,7 @@
   "expand-reductions","indirectbr-expand",
   "generic-to-nvvm",  "expandmemcmp",
   "loop-reduce",  "lower-amx-type",
-  "polyhedral-info"};
+  "lower-amx-intrinsics", "polyhedral-info"};
   for (const auto  : PassNamePrefix)
 if (Pass.startswith(P))
   return true;
Index: llvm/test/CodeGen/X86/opt-pipeline.ll
===
--- llvm/test/CodeGen/X86/opt-pipeline.ll
+++ llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -24,11 +24,12 @@
 ; CHECK-NEXT: Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT: FunctionPass Manager
 ; CHECK-NEXT:   Expand Atomic instructions
+; CHECK-NEXT:   Dominator Tree Construction
+; CHECK-NEXT:   Natural Loop Information
+; CHECK-NEXT:   Lower AMX intrinsics
 ; CHECK-NEXT:   Lower AMX type for load/store
 ; CHECK-NEXT:   Module Verifier
-; CHECK-NEXT:   Dominator Tree Construction
 ; CHECK-NEXT:   Basic Alias Analysis (stateless AA impl)
-; CHECK-NEXT:   Natural Loop Information
 ; CHECK-NEXT:   Canonicalize natural loops
 ; CHECK-NEXT:   Scalar Evolution Analysis
 ; CHECK-NEXT:   Loop Pass Manager
Index: llvm/test/CodeGen/X86/O0-pipeline.ll
===
--- llvm/test/CodeGen/X86/O0-pipeline.ll
+++ llvm/test/CodeGen/X86/O0-pipeline.ll
@@ -18,6 +18,9 @@
 ; CHECK-NEXT: Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT: FunctionPass Manager
 ; CHECK-NEXT:   Expand Atomic instructions
+; CHECK-NEXT:   Dominator Tree Construction
+; CHECK-NEXT:   Natural Loop Information
+; CHECK-NEXT:   Lower AMX intrinsics
 ; CHECK-NEXT:   Lower AMX type for load/store
 ; CHECK-NEXT:   Module Verifier
 ; CHECK-NEXT:   Lower Garbage Collection Instructions
Index: llvm/test/CodeGen/X86/AMX/amx-type.ll
===
--- llvm/test/CodeGen/X86/AMX/amx-type.ll
+++ llvm/test/CodeGen/X86/AMX/amx-type.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -lower-amx-type %s -S | FileCheck %s
+; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s
 
 %struct.__tile_str = type { i16, i16, <256 x i32> }
 
Index: llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
===
--- /dev/null
+++ llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
@@ -0,0 +1,237 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics %s -S | FileCheck %s
+
+define dso_local void @test_amx_load_non_O0(i16 signext %row, i16 signext %col, i8 *%ptr, i64 %stride, <256 x i32>* %vptr) {
+; CHECK-LABEL: @test_amx_load_non_O0(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:[[TMP0:%.*]] = lshr i16 [[COL:%.*]], 2
+; CHECK-NEXT:[[TMP1:%.*]] = lshr i64 [[STRIDE:%.*]], 2
+; CHECK-NEXT:br label [[TILELOAD_SCALARIZE_ROWS_HEADER:%.*]]
+; CHECK:   tileload.scalarize.rows.header:
+; CHECK-NEXT:[[TILELOAD_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TILELOAD_SCALARIZE_ROWS_STEP:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_PHI_ROW:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP11:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH]] ]
+; CHECK-NEXT:br label [[TILELOAD_SCALARIZE_ROWS_BODY:%.*]]
+; CHECK:   tileload.scalarize.rows.body:
+; CHECK-NEXT:br label [[TILELOAD_SCALARIZE_COLS_HEADER:%.*]]
+; CHECK:   tileload.scalarize.cols.header:
+; CHECK-NEXT:[[TILELOAD_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TILELOAD_SCALARIZE_ROWS_BODY]] ], [ [[TILELOAD_SCALARIZE_COLS_STEP:%.*]], [[TILELOAD_SCALARIZE_COLS_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_PHI:%.*]] = phi <256 x i32> [ [[VEC_PHI_ROW]], [[TILELOAD_SCALARIZE_ROWS_BODY]] ], [ [[TMP11]], [[TILELOAD_SCALARIZE_COLS_LATCH]] ]
+; CHECK-NEXT:br label [[TILELOAD_SCALARIZE_COLS_BODY:%.*]]
+; CHECK:   tileload.scalarize.cols.body:
+; CHECK-NEXT:[[TMP2:%.*]] = 

[PATCH] D96110: [X86] Pass to transform tdpbf16ps intrinsics to scalar operation.

2021-02-22 Thread Bing Yu via Phabricator via cfe-commits
yubing updated this revision to Diff 325688.
yubing added a comment.

Modify some comments


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D96110/new/

https://reviews.llvm.org/D96110

Files:
  clang/include/clang/Basic/BuiltinsX86_64.def
  clang/lib/Headers/amxintrin.h
  llvm/include/llvm/IR/IntrinsicsX86.td
  llvm/lib/Target/X86/X86ExpandPseudo.cpp
  llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
  llvm/lib/Target/X86/X86InstrAMX.td
  llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
  llvm/lib/Target/X86/X86LowerAMXType.cpp
  llvm/lib/Target/X86/X86PreTileConfig.cpp
  llvm/lib/Target/X86/X86RegisterInfo.cpp
  llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll

Index: llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
===
--- llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
+++ llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
@@ -97,8 +97,8 @@
   ret void
 }
 
-define dso_local void @test_amx_dp(i16 signext %row, i16 signext %col, i16 signext %k, <256 x i32> %c, <256 x i32> %a, <256 x i32> %b, <256 x i32>* %vptr) #0 {
-; CHECK-LABEL: @test_amx_dp(
+define dso_local void @test_amx_dpbssd(i16 signext %row, i16 signext %col, i16 signext %k, <256 x i32> %c, <256 x i32> %a, <256 x i32> %b, <256 x i32>* %vptr) #0 {
+; CHECK-LABEL: @test_amx_dpbssd(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:[[A_AMX:%.*]] = bitcast <256 x i32> [[A:%.*]] to x86_amx
 ; CHECK-NEXT:[[B_AMX:%.*]] = bitcast <256 x i32> [[B:%.*]] to x86_amx
@@ -172,6 +172,84 @@
   ret void
 }
 
+define dso_local void @test_amx_dpbf16ps(i16 signext %row, i16 signext %col, i16 signext %k, <256 x i32> %c, <256 x i32> %a, <256 x i32> %b, <256 x i32>* %vptr) #0 {
+; CHECK-LABEL: @test_amx_dpbf16ps(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:[[A_AMX:%.*]] = bitcast <256 x i32> [[A:%.*]] to x86_amx
+; CHECK-NEXT:[[B_AMX:%.*]] = bitcast <256 x i32> [[B:%.*]] to x86_amx
+; CHECK-NEXT:[[C_AMX:%.*]] = bitcast <256 x i32> [[C:%.*]] to x86_amx
+; CHECK-NEXT:[[TMP0:%.*]] = lshr i16 [[COL:%.*]], 2
+; CHECK-NEXT:[[TMP1:%.*]] = lshr i16 [[K:%.*]], 2
+; CHECK-NEXT:br label [[TDPBF16PS_SCALARIZE_ROWS_HEADER:%.*]]
+; CHECK:   tdpbf16ps.scalarize.rows.header:
+; CHECK-NEXT:[[TDPBF16PS_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TDPBF16PS_SCALARIZE_ROWS_STEP:%.*]], [[TDPBF16PS_SCALARIZE_ROWS_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_C_PHI_ROW:%.*]] = phi <256 x i32> [ [[C]], [[ENTRY]] ], [ [[TMP21:%.*]], [[TDPBF16PS_SCALARIZE_ROWS_LATCH]] ]
+; CHECK-NEXT:[[VEC_D_PHI_ROW:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP22:%.*]], [[TDPBF16PS_SCALARIZE_ROWS_LATCH]] ]
+; CHECK-NEXT:br label [[TDPBF16PS_SCALARIZE_ROWS_BODY:%.*]]
+; CHECK:   tdpbf16ps.scalarize.rows.body:
+; CHECK-NEXT:br label [[TDPBF16PS_SCALARIZE_COLS_HEADER:%.*]]
+; CHECK:   tdpbf16ps.scalarize.cols.header:
+; CHECK-NEXT:[[TDPBF16PS_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TDPBF16PS_SCALARIZE_ROWS_BODY]] ], [ [[TDPBF16PS_SCALARIZE_COLS_STEP:%.*]], [[TDPBF16PS_SCALARIZE_COLS_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_C_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_ROW]], [[TDPBF16PS_SCALARIZE_ROWS_BODY]] ], [ [[TMP21]], [[TDPBF16PS_SCALARIZE_COLS_LATCH]] ]
+; CHECK-NEXT:[[VEC_D_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_D_PHI_ROW]], [[TDPBF16PS_SCALARIZE_ROWS_BODY]] ], [ [[TMP22]], [[TDPBF16PS_SCALARIZE_COLS_LATCH]] ]
+; CHECK-NEXT:br label [[TDPBF16PS_SCALARIZE_COLS_BODY:%.*]]
+; CHECK:   tdpbf16ps.scalarize.cols.body:
+; CHECK-NEXT:br label [[TDPBF16PS_SCALARIZE_INNER_HEADER:%.*]]
+; CHECK:   tdpbf16ps.scalarize.inner.header:
+; CHECK-NEXT:[[TDPBF16PS_SCALARIZE_INNER_IV:%.*]] = phi i16 [ 0, [[TDPBF16PS_SCALARIZE_COLS_BODY]] ], [ [[TDPBF16PS_SCALARIZE_INNER_STEP:%.*]], [[TDPBF16PS_SCALARIZE_INNER_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_C_INNER_PHI:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_COL]], [[TDPBF16PS_SCALARIZE_COLS_BODY]] ], [ [[TMP21]], [[TDPBF16PS_SCALARIZE_INNER_LATCH]] ]
+; CHECK-NEXT:[[VEC_D_INNER_PHI:%.*]] = phi <256 x i32> [ [[VEC_D_PHI_COL]], [[TDPBF16PS_SCALARIZE_COLS_BODY]] ], [ [[TMP22]], [[TDPBF16PS_SCALARIZE_INNER_LATCH]] ]
+; CHECK-NEXT:br label [[TDPBF16PS_SCALARIZE_INNER_BODY:%.*]]
+; CHECK:   tdpbf16ps.scalarize.inner.body:
+; CHECK-NEXT:[[TMP2:%.*]] = mul i16 [[TDPBF16PS_SCALARIZE_ROWS_IV]], 16
+; CHECK-NEXT:[[TMP3:%.*]] = add i16 [[TMP2]], [[TDPBF16PS_SCALARIZE_COLS_IV]]
+; CHECK-NEXT:[[TMP4:%.*]] = mul i16 [[TDPBF16PS_SCALARIZE_ROWS_IV]], 16
+; CHECK-NEXT:[[TMP5:%.*]] = add i16 [[TMP4]], [[TDPBF16PS_SCALARIZE_INNER_IV]]
+; CHECK-NEXT:[[TMP6:%.*]] = mul i16 [[TDPBF16PS_SCALARIZE_INNER_IV]], 16
+; CHECK-NEXT:[[TMP7:%.*]] = add i16 [[TMP6]], [[TDPBF16PS_SCALARIZE_COLS_IV]]
+; CHECK-NEXT:[[TMP8:%.*]] = extractelement <256 x i32> [[VEC_C_INNER_PHI]], i16 [[TMP3]]
+; CHECK-NEXT:[[TMP9:%.*]] = bitcast i32 [[TMP8]] to float
+; CHECK-NEXT:[[TMP10:%.*]] = 

[PATCH] D96110: [X86] Pass to transform tdpbf16ps intrinsics to scalar operation.

2021-02-22 Thread Bing Yu via Phabricator via cfe-commits
yubing updated this revision to Diff 325685.
yubing added a comment.

Fix incorrect naming for dpbf16's bb


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D96110/new/

https://reviews.llvm.org/D96110

Files:
  clang/include/clang/Basic/BuiltinsX86_64.def
  clang/lib/Headers/amxintrin.h
  llvm/include/llvm/IR/IntrinsicsX86.td
  llvm/lib/Target/X86/X86ExpandPseudo.cpp
  llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
  llvm/lib/Target/X86/X86InstrAMX.td
  llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
  llvm/lib/Target/X86/X86LowerAMXType.cpp
  llvm/lib/Target/X86/X86PreTileConfig.cpp
  llvm/lib/Target/X86/X86RegisterInfo.cpp
  llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll

Index: llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
===
--- llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
+++ llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
@@ -97,8 +97,8 @@
   ret void
 }
 
-define dso_local void @test_amx_dp(i16 signext %row, i16 signext %col, i16 signext %k, <256 x i32> %c, <256 x i32> %a, <256 x i32> %b, <256 x i32>* %vptr) #0 {
-; CHECK-LABEL: @test_amx_dp(
+define dso_local void @test_amx_dpbssd(i16 signext %row, i16 signext %col, i16 signext %k, <256 x i32> %c, <256 x i32> %a, <256 x i32> %b, <256 x i32>* %vptr) #0 {
+; CHECK-LABEL: @test_amx_dpbssd(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:[[A_AMX:%.*]] = bitcast <256 x i32> [[A:%.*]] to x86_amx
 ; CHECK-NEXT:[[B_AMX:%.*]] = bitcast <256 x i32> [[B:%.*]] to x86_amx
@@ -172,6 +172,84 @@
   ret void
 }
 
+define dso_local void @test_amx_dpbf16ps(i16 signext %row, i16 signext %col, i16 signext %k, <256 x i32> %c, <256 x i32> %a, <256 x i32> %b, <256 x i32>* %vptr) #0 {
+; CHECK-LABEL: @test_amx_dpbf16ps(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:[[A_AMX:%.*]] = bitcast <256 x i32> [[A:%.*]] to x86_amx
+; CHECK-NEXT:[[B_AMX:%.*]] = bitcast <256 x i32> [[B:%.*]] to x86_amx
+; CHECK-NEXT:[[C_AMX:%.*]] = bitcast <256 x i32> [[C:%.*]] to x86_amx
+; CHECK-NEXT:[[TMP0:%.*]] = lshr i16 [[COL:%.*]], 2
+; CHECK-NEXT:[[TMP1:%.*]] = lshr i16 [[K:%.*]], 2
+; CHECK-NEXT:br label [[TDPBF16PS_SCALARIZE_ROWS_HEADER:%.*]]
+; CHECK:   tdpbf16ps.scalarize.rows.header:
+; CHECK-NEXT:[[TDPBF16PS_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TDPBF16PS_SCALARIZE_ROWS_STEP:%.*]], [[TDPBF16PS_SCALARIZE_ROWS_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_C_PHI_ROW:%.*]] = phi <256 x i32> [ [[C]], [[ENTRY]] ], [ [[TMP21:%.*]], [[TDPBF16PS_SCALARIZE_ROWS_LATCH]] ]
+; CHECK-NEXT:[[VEC_D_PHI_ROW:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP22:%.*]], [[TDPBF16PS_SCALARIZE_ROWS_LATCH]] ]
+; CHECK-NEXT:br label [[TDPBF16PS_SCALARIZE_ROWS_BODY:%.*]]
+; CHECK:   tdpbf16ps.scalarize.rows.body:
+; CHECK-NEXT:br label [[TDPBF16PS_SCALARIZE_COLS_HEADER:%.*]]
+; CHECK:   tdpbf16ps.scalarize.cols.header:
+; CHECK-NEXT:[[TDPBF16PS_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TDPBF16PS_SCALARIZE_ROWS_BODY]] ], [ [[TDPBF16PS_SCALARIZE_COLS_STEP:%.*]], [[TDPBF16PS_SCALARIZE_COLS_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_C_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_ROW]], [[TDPBF16PS_SCALARIZE_ROWS_BODY]] ], [ [[TMP21]], [[TDPBF16PS_SCALARIZE_COLS_LATCH]] ]
+; CHECK-NEXT:[[VEC_D_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_D_PHI_ROW]], [[TDPBF16PS_SCALARIZE_ROWS_BODY]] ], [ [[TMP22]], [[TDPBF16PS_SCALARIZE_COLS_LATCH]] ]
+; CHECK-NEXT:br label [[TDPBF16PS_SCALARIZE_COLS_BODY:%.*]]
+; CHECK:   tdpbf16ps.scalarize.cols.body:
+; CHECK-NEXT:br label [[TDPBF16PS_SCALARIZE_INNER_HEADER:%.*]]
+; CHECK:   tdpbf16ps.scalarize.inner.header:
+; CHECK-NEXT:[[TDPBF16PS_SCALARIZE_INNER_IV:%.*]] = phi i16 [ 0, [[TDPBF16PS_SCALARIZE_COLS_BODY]] ], [ [[TDPBF16PS_SCALARIZE_INNER_STEP:%.*]], [[TDPBF16PS_SCALARIZE_INNER_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_C_INNER_PHI:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_COL]], [[TDPBF16PS_SCALARIZE_COLS_BODY]] ], [ [[TMP21]], [[TDPBF16PS_SCALARIZE_INNER_LATCH]] ]
+; CHECK-NEXT:[[VEC_D_INNER_PHI:%.*]] = phi <256 x i32> [ [[VEC_D_PHI_COL]], [[TDPBF16PS_SCALARIZE_COLS_BODY]] ], [ [[TMP22]], [[TDPBF16PS_SCALARIZE_INNER_LATCH]] ]
+; CHECK-NEXT:br label [[TDPBF16PS_SCALARIZE_INNER_BODY:%.*]]
+; CHECK:   tdpbf16ps.scalarize.inner.body:
+; CHECK-NEXT:[[TMP2:%.*]] = mul i16 [[TDPBF16PS_SCALARIZE_ROWS_IV]], 16
+; CHECK-NEXT:[[TMP3:%.*]] = add i16 [[TMP2]], [[TDPBF16PS_SCALARIZE_COLS_IV]]
+; CHECK-NEXT:[[TMP4:%.*]] = mul i16 [[TDPBF16PS_SCALARIZE_ROWS_IV]], 16
+; CHECK-NEXT:[[TMP5:%.*]] = add i16 [[TMP4]], [[TDPBF16PS_SCALARIZE_INNER_IV]]
+; CHECK-NEXT:[[TMP6:%.*]] = mul i16 [[TDPBF16PS_SCALARIZE_INNER_IV]], 16
+; CHECK-NEXT:[[TMP7:%.*]] = add i16 [[TMP6]], [[TDPBF16PS_SCALARIZE_COLS_IV]]
+; CHECK-NEXT:[[TMP8:%.*]] = extractelement <256 x i32> [[VEC_C_INNER_PHI]], i16 [[TMP3]]
+; CHECK-NEXT:[[TMP9:%.*]] = bitcast i32 [[TMP8]] to float
+; CHECK-NEXT:

[PATCH] D96110: [X86] Pass to transform tdpbf16ps intrinsics to scalar operation.

2021-02-22 Thread Bing Yu via Phabricator via cfe-commits
yubing updated this revision to Diff 325683.
yubing added a comment.

Rebase and add a testcase.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D96110/new/

https://reviews.llvm.org/D96110

Files:
  clang/include/clang/Basic/BuiltinsX86_64.def
  clang/lib/Headers/amxintrin.h
  llvm/include/llvm/IR/IntrinsicsX86.td
  llvm/lib/Target/X86/X86ExpandPseudo.cpp
  llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
  llvm/lib/Target/X86/X86InstrAMX.td
  llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
  llvm/lib/Target/X86/X86LowerAMXType.cpp
  llvm/lib/Target/X86/X86PreTileConfig.cpp
  llvm/lib/Target/X86/X86RegisterInfo.cpp
  llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll

Index: llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
===
--- llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
+++ llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
@@ -97,8 +97,8 @@
   ret void
 }
 
-define dso_local void @test_amx_dp(i16 signext %row, i16 signext %col, i16 signext %k, <256 x i32> %c, <256 x i32> %a, <256 x i32> %b, <256 x i32>* %vptr) #0 {
-; CHECK-LABEL: @test_amx_dp(
+define dso_local void @test_amx_dpbssd(i16 signext %row, i16 signext %col, i16 signext %k, <256 x i32> %c, <256 x i32> %a, <256 x i32> %b, <256 x i32>* %vptr) #0 {
+; CHECK-LABEL: @test_amx_dpbssd(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:[[A_AMX:%.*]] = bitcast <256 x i32> [[A:%.*]] to x86_amx
 ; CHECK-NEXT:[[B_AMX:%.*]] = bitcast <256 x i32> [[B:%.*]] to x86_amx
@@ -172,6 +172,84 @@
   ret void
 }
 
+define dso_local void @test_amx_dpbf16ps(i16 signext %row, i16 signext %col, i16 signext %k, <256 x i32> %c, <256 x i32> %a, <256 x i32> %b, <256 x i32>* %vptr) #0 {
+; CHECK-LABEL: @test_amx_dpbf16ps(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:[[A_AMX:%.*]] = bitcast <256 x i32> [[A:%.*]] to x86_amx
+; CHECK-NEXT:[[B_AMX:%.*]] = bitcast <256 x i32> [[B:%.*]] to x86_amx
+; CHECK-NEXT:[[C_AMX:%.*]] = bitcast <256 x i32> [[C:%.*]] to x86_amx
+; CHECK-NEXT:[[TMP0:%.*]] = lshr i16 [[COL:%.*]], 2
+; CHECK-NEXT:[[TMP1:%.*]] = lshr i16 [[K:%.*]], 2
+; CHECK-NEXT:br label [[TILEDPBSSD_SCALARIZE_ROWS_HEADER:%.*]]
+; CHECK:   tiledpbssd.scalarize.rows.header:
+; CHECK-NEXT:[[TILEDPBSSD_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TILEDPBSSD_SCALARIZE_ROWS_STEP:%.*]], [[TILEDPBSSD_SCALARIZE_ROWS_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_C_PHI_ROW:%.*]] = phi <256 x i32> [ [[C]], [[ENTRY]] ], [ [[TMP21:%.*]], [[TILEDPBSSD_SCALARIZE_ROWS_LATCH]] ]
+; CHECK-NEXT:[[VEC_D_PHI_ROW:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP22:%.*]], [[TILEDPBSSD_SCALARIZE_ROWS_LATCH]] ]
+; CHECK-NEXT:br label [[TILEDPBSSD_SCALARIZE_ROWS_BODY:%.*]]
+; CHECK:   tiledpbssd.scalarize.rows.body:
+; CHECK-NEXT:br label [[TILEDPBSSD_SCALARIZE_COLS_HEADER:%.*]]
+; CHECK:   tiledpbssd.scalarize.cols.header:
+; CHECK-NEXT:[[TILEDPBSSD_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TILEDPBSSD_SCALARIZE_ROWS_BODY]] ], [ [[TILEDPBSSD_SCALARIZE_COLS_STEP:%.*]], [[TILEDPBSSD_SCALARIZE_COLS_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_C_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_ROW]], [[TILEDPBSSD_SCALARIZE_ROWS_BODY]] ], [ [[TMP21]], [[TILEDPBSSD_SCALARIZE_COLS_LATCH]] ]
+; CHECK-NEXT:[[VEC_D_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_D_PHI_ROW]], [[TILEDPBSSD_SCALARIZE_ROWS_BODY]] ], [ [[TMP22]], [[TILEDPBSSD_SCALARIZE_COLS_LATCH]] ]
+; CHECK-NEXT:br label [[TILEDPBSSD_SCALARIZE_COLS_BODY:%.*]]
+; CHECK:   tiledpbssd.scalarize.cols.body:
+; CHECK-NEXT:br label [[TILEDPBSSD_SCALARIZE_INNER_HEADER:%.*]]
+; CHECK:   tiledpbssd.scalarize.inner.header:
+; CHECK-NEXT:[[TILEDPBSSD_SCALARIZE_INNER_IV:%.*]] = phi i16 [ 0, [[TILEDPBSSD_SCALARIZE_COLS_BODY]] ], [ [[TILEDPBSSD_SCALARIZE_INNER_STEP:%.*]], [[TILEDPBSSD_SCALARIZE_INNER_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_C_INNER_PHI:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_COL]], [[TILEDPBSSD_SCALARIZE_COLS_BODY]] ], [ [[TMP21]], [[TILEDPBSSD_SCALARIZE_INNER_LATCH]] ]
+; CHECK-NEXT:[[VEC_D_INNER_PHI:%.*]] = phi <256 x i32> [ [[VEC_D_PHI_COL]], [[TILEDPBSSD_SCALARIZE_COLS_BODY]] ], [ [[TMP22]], [[TILEDPBSSD_SCALARIZE_INNER_LATCH]] ]
+; CHECK-NEXT:br label [[TILEDPBSSD_SCALARIZE_INNER_BODY:%.*]]
+; CHECK:   tiledpbssd.scalarize.inner.body:
+; CHECK-NEXT:[[TMP2:%.*]] = mul i16 [[TILEDPBSSD_SCALARIZE_ROWS_IV]], 16
+; CHECK-NEXT:[[TMP3:%.*]] = add i16 [[TMP2]], [[TILEDPBSSD_SCALARIZE_COLS_IV]]
+; CHECK-NEXT:[[TMP4:%.*]] = mul i16 [[TILEDPBSSD_SCALARIZE_ROWS_IV]], 16
+; CHECK-NEXT:[[TMP5:%.*]] = add i16 [[TMP4]], [[TILEDPBSSD_SCALARIZE_INNER_IV]]
+; CHECK-NEXT:[[TMP6:%.*]] = mul i16 [[TILEDPBSSD_SCALARIZE_INNER_IV]], 16
+; CHECK-NEXT:[[TMP7:%.*]] = add i16 [[TMP6]], [[TILEDPBSSD_SCALARIZE_COLS_IV]]
+; CHECK-NEXT:[[TMP8:%.*]] = extractelement <256 x i32> [[VEC_C_INNER_PHI]], i16 [[TMP3]]
+; CHECK-NEXT:[[TMP9:%.*]] = bitcast i32 [[TMP8]] to float

[PATCH] D93594: [X86] Pass to transform amx intrinsics to scalar operation.

2021-02-22 Thread Bing Yu via Phabricator via cfe-commits
yubing marked 13 inline comments as done.
yubing added inline comments.



Comment at: llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp:487
+  SmallVector WorkList;
+  for (BasicBlock *BB : depth_first()) {
+for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;) {

LuoYuanke wrote:
> Do we iterate the instructions in topology order or in post order?
It should be pre-order since we need to handle cases without bitcasts, such as 
amx-low-intrinsics-no-bitcast.ll.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D93594/new/

https://reviews.llvm.org/D93594

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D93594: [X86] Pass to transform amx intrinsics to scalar operation.

2021-02-22 Thread Bing Yu via Phabricator via cfe-commits
yubing updated this revision to Diff 325670.
yubing added a comment.

Address comments above


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D93594/new/

https://reviews.llvm.org/D93594

Files:
  llvm/include/llvm/CodeGen/Passes.h
  llvm/lib/Target/X86/CMakeLists.txt
  llvm/lib/Target/X86/X86.h
  llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
  llvm/lib/Target/X86/X86LowerAMXType.cpp
  llvm/lib/Target/X86/X86TargetMachine.cpp
  llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-bitcast.ll
  llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
  llvm/test/CodeGen/X86/AMX/amx-type.ll
  llvm/test/CodeGen/X86/O0-pipeline.ll
  llvm/tools/opt/opt.cpp

Index: llvm/tools/opt/opt.cpp
===
--- llvm/tools/opt/opt.cpp
+++ llvm/tools/opt/opt.cpp
@@ -497,7 +497,7 @@
   "expand-reductions","indirectbr-expand",
   "generic-to-nvvm",  "expandmemcmp",
   "loop-reduce",  "lower-amx-type",
-  "polyhedral-info"};
+  "lower-amx-intrinsics", "polyhedral-info"};
   for (const auto  : PassNamePrefix)
 if (Pass.startswith(P))
   return true;
Index: llvm/test/CodeGen/X86/O0-pipeline.ll
===
--- llvm/test/CodeGen/X86/O0-pipeline.ll
+++ llvm/test/CodeGen/X86/O0-pipeline.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; When EXPENSIVE_CHECKS are enabled, the machine verifier appears between each
 ; pass. Ignore it with 'grep -v'.
 ; RUN: llc -mtriple=x86_64-- -O0 -debug-pass=Structure < %s -o /dev/null 2>&1 \
@@ -18,6 +19,9 @@
 ; CHECK-NEXT: Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT: FunctionPass Manager
 ; CHECK-NEXT:   Expand Atomic instructions
+; CHECK-NEXT:   Dominator Tree Construction
+; CHECK-NEXT:   Natural Loop Information
+; CHECK-NEXT:   Lower AMX intrinsics
 ; CHECK-NEXT:   Lower AMX type for load/store
 ; CHECK-NEXT:   Module Verifier
 ; CHECK-NEXT:   Lower Garbage Collection Instructions
Index: llvm/test/CodeGen/X86/AMX/amx-type.ll
===
--- llvm/test/CodeGen/X86/AMX/amx-type.ll
+++ llvm/test/CodeGen/X86/AMX/amx-type.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -lower-amx-type %s -S | FileCheck %s
+; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s
 
 %struct.__tile_str = type { i16, i16, <256 x i32> }
 
Index: llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
===
--- /dev/null
+++ llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
@@ -0,0 +1,237 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics %s -S | FileCheck %s
+
+define dso_local void @test_amx_load_non_O0(i16 signext %row, i16 signext %col, i8 *%ptr, i64 %stride, <256 x i32>* %vptr) {
+; CHECK-LABEL: @test_amx_load_non_O0(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:[[TMP0:%.*]] = lshr i16 [[COL:%.*]], 2
+; CHECK-NEXT:[[TMP1:%.*]] = lshr i64 [[STRIDE:%.*]], 2
+; CHECK-NEXT:br label [[TILELOAD_SCALARIZE_ROWS_HEADER:%.*]]
+; CHECK:   tileload.scalarize.rows.header:
+; CHECK-NEXT:[[TILELOAD_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TILELOAD_SCALARIZE_ROWS_STEP:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_PHI_ROW:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP11:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH]] ]
+; CHECK-NEXT:br label [[TILELOAD_SCALARIZE_ROWS_BODY:%.*]]
+; CHECK:   tileload.scalarize.rows.body:
+; CHECK-NEXT:br label [[TILELOAD_SCALARIZE_COLS_HEADER:%.*]]
+; CHECK:   tileload.scalarize.cols.header:
+; CHECK-NEXT:[[TILELOAD_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TILELOAD_SCALARIZE_ROWS_BODY]] ], [ [[TILELOAD_SCALARIZE_COLS_STEP:%.*]], [[TILELOAD_SCALARIZE_COLS_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_PHI:%.*]] = phi <256 x i32> [ [[VEC_PHI_ROW]], [[TILELOAD_SCALARIZE_ROWS_BODY]] ], [ [[TMP11]], [[TILELOAD_SCALARIZE_COLS_LATCH]] ]
+; CHECK-NEXT:br label [[TILELOAD_SCALARIZE_COLS_BODY:%.*]]
+; CHECK:   tileload.scalarize.cols.body:
+; CHECK-NEXT:[[TMP2:%.*]] = zext i16 [[TILELOAD_SCALARIZE_ROWS_IV]] to i64
+; CHECK-NEXT:[[TMP3:%.*]] = zext i16 [[TILELOAD_SCALARIZE_COLS_IV]] to i64
+; CHECK-NEXT:[[TMP4:%.*]] = mul i64 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:[[TMP5:%.*]] = add i64 [[TMP4]], [[TMP3]]
+; CHECK-NEXT:[[TMP6:%.*]] = bitcast i8* [[PTR:%.*]] to i32*
+; CHECK-NEXT:[[TMP7:%.*]] = getelementptr i32, i32* [[TMP6]], i64 [[TMP5]]
+; CHECK-NEXT:[[TMP8:%.*]] = mul i16 [[TILELOAD_SCALARIZE_ROWS_IV]], 16
+; CHECK-NEXT:[[TMP9:%.*]] = add i16 [[TMP8]], [[TILELOAD_SCALARIZE_COLS_IV]]
+; CHECK-NEXT:[[TMP10:%.*]] = load i32, i32* [[TMP7]], align 4
+; CHECK-NEXT:

[PATCH] D96110: [X86] Pass to transform tdpbf16ps intrinsics to scalar operation.

2021-02-20 Thread Bing Yu via Phabricator via cfe-commits
yubing updated this revision to Diff 325170.
yubing edited the summary of this revision.
yubing added a comment.

Address comments above and refactor some code


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D96110/new/

https://reviews.llvm.org/D96110

Files:
  clang/include/clang/Basic/BuiltinsX86_64.def
  clang/lib/Headers/amxintrin.h
  llvm/include/llvm/IR/IntrinsicsX86.td
  llvm/lib/Target/X86/X86ExpandPseudo.cpp
  llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
  llvm/lib/Target/X86/X86InstrAMX.td
  llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
  llvm/lib/Target/X86/X86LowerAMXType.cpp
  llvm/lib/Target/X86/X86PreTileConfig.cpp
  llvm/lib/Target/X86/X86RegisterInfo.cpp

Index: llvm/lib/Target/X86/X86RegisterInfo.cpp
===
--- llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -878,6 +878,7 @@
   // We only collect the tile shape that is defined.
   case X86::PTILELOADDV:
   case X86::PTDPBSSDV:
+  case X86::PTDPBF16PSV:
   case X86::PTILEZEROV:
 MachineOperand  = MI->getOperand(1);
 MachineOperand  = MI->getOperand(2);
Index: llvm/lib/Target/X86/X86PreTileConfig.cpp
===
--- llvm/lib/Target/X86/X86PreTileConfig.cpp
+++ llvm/lib/Target/X86/X86PreTileConfig.cpp
@@ -127,6 +127,7 @@
 llvm_unreachable("Unexpected machine instruction on tile");
   case X86::PTILELOADDV:
   case X86::PTDPBSSDV:
+  case X86::PTDPBF16PSV:
   case X86::PTILEZEROV:
 MachineOperand  = const_cast(MI.getOperand(1));
 MachineOperand  = const_cast(MI.getOperand(2));
@@ -221,6 +222,7 @@
   case X86::PTILELOADDV:
   case X86::PTILESTOREDV:
   case X86::PTDPBSSDV:
+  case X86::PTDPBF16PSV:
   case X86::PTILEZEROV:
 return true;
   }
Index: llvm/lib/Target/X86/X86LowerAMXType.cpp
===
--- llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -69,7 +69,8 @@
   }
   // a * b + c
   // The shape depends on which operand.
-  case Intrinsic::x86_tdpbssd_internal: {
+  case Intrinsic::x86_tdpbssd_internal:
+  case Intrinsic::x86_tdpbf16ps_internal: {
 switch (OpNo) {
 case 3:
   Row = II->getArgOperand(0);
Index: llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
===
--- llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
+++ llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
@@ -22,7 +22,6 @@
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/Passes.h"
-
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/IR/DataLayout.h"
@@ -209,11 +208,11 @@
   B.CreateStore(Elt, EltPtr);
 }
 
-static Value *createTileDPBSSDLoops(BasicBlock *Start, BasicBlock *End,
-IRBuilderBase , DomTreeUpdater ,
-LoopInfo , Value *Row, Value *Col,
-Value *K, Value *Acc, Value *LHS,
-Value *RHS) {
+template 
+static Value *createTileDPLoops(BasicBlock *Start, BasicBlock *End,
+IRBuilderBase , DomTreeUpdater ,
+LoopInfo , Value *Row, Value *Col, Value *K,
+Value *Acc, Value *LHS, Value *RHS) {
   Loop *RowLoop = LI.AllocateLoop();
   Loop *ColLoop = LI.AllocateLoop();
   Loop *InnerLoop = LI.AllocateLoop();
@@ -321,17 +320,40 @@
   B.CreateAdd(B.CreateMul(CurrentRow, B.getInt16(16)), CurrentInner);
   Value *IdxB =
   B.CreateAdd(B.CreateMul(CurrentInner, B.getInt16(16)), CurrentCol);
-
-  FixedVectorType *V4I8Ty = FixedVectorType::get(B.getInt8Ty(), 4);
-  FixedVectorType *V4I32Ty = FixedVectorType::get(B.getInt32Ty(), 4);
-  Value *EltC = B.CreateExtractElement(VecCPhi, IdxC);
-  Value *EltA = B.CreateExtractElement(VecA, IdxA);
-  Value *SubVecA = B.CreateBitCast(EltA, V4I8Ty);
-  Value *EltB = B.CreateExtractElement(VecB, IdxB);
-  Value *SubVecB = B.CreateBitCast(EltB, V4I8Ty);
-  Value *SubVecR = B.CreateAddReduce(B.CreateMul(
-  B.CreateSExt(SubVecA, V4I32Ty), B.CreateSExt(SubVecB, V4I32Ty)));
-  Value *ResElt = B.CreateAdd(EltC, SubVecR);
+  Value *ResElt = nullptr;
+  if (IntrID == Intrinsic::x86_tdpbssd_internal) {
+FixedVectorType *V4I8Ty = FixedVectorType::get(B.getInt8Ty(), 4);
+FixedVectorType *V4I32Ty = FixedVectorType::get(B.getInt32Ty(), 4);
+Value *EltC = B.CreateExtractElement(VecCPhi, IdxC);
+Value *EltA = B.CreateExtractElement(VecA, IdxA);
+Value *SubVecA = B.CreateBitCast(EltA, V4I8Ty);
+Value *EltB = B.CreateExtractElement(VecB, IdxB);
+Value *SubVecB = B.CreateBitCast(EltB, V4I8Ty);
+Value *SubVecR = B.CreateAddReduce(B.CreateMul(
+B.CreateSExt(SubVecA, V4I32Ty), 

[PATCH] D93594: [X86] Pass to transform amx intrinsics to scalar operation.

2021-02-19 Thread Bing Yu via Phabricator via cfe-commits
yubing updated this revision to Diff 325155.
yubing marked 5 inline comments as done.
yubing added a comment.

Small fix for some code


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D93594/new/

https://reviews.llvm.org/D93594

Files:
  llvm/include/llvm/CodeGen/Passes.h
  llvm/lib/Target/X86/CMakeLists.txt
  llvm/lib/Target/X86/X86.h
  llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
  llvm/lib/Target/X86/X86LowerAMXType.cpp
  llvm/lib/Target/X86/X86TargetMachine.cpp
  llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-bitcast.ll
  llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
  llvm/test/CodeGen/X86/AMX/amx-type.ll
  llvm/test/CodeGen/X86/O0-pipeline.ll

Index: llvm/test/CodeGen/X86/O0-pipeline.ll
===
--- llvm/test/CodeGen/X86/O0-pipeline.ll
+++ llvm/test/CodeGen/X86/O0-pipeline.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; When EXPENSIVE_CHECKS are enabled, the machine verifier appears between each
 ; pass. Ignore it with 'grep -v'.
 ; RUN: llc -mtriple=x86_64-- -O0 -debug-pass=Structure < %s -o /dev/null 2>&1 \
@@ -18,6 +19,9 @@
 ; CHECK-NEXT: Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT: FunctionPass Manager
 ; CHECK-NEXT:   Expand Atomic instructions
+; CHECK-NEXT:   Dominator Tree Construction
+; CHECK-NEXT:   Natural Loop Information
+; CHECK-NEXT:   Lower AMX intrinsics
 ; CHECK-NEXT:   Lower AMX type for load/store
 ; CHECK-NEXT:   Module Verifier
 ; CHECK-NEXT:   Lower Garbage Collection Instructions
Index: llvm/test/CodeGen/X86/AMX/amx-type.ll
===
--- llvm/test/CodeGen/X86/AMX/amx-type.ll
+++ llvm/test/CodeGen/X86/AMX/amx-type.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -lower-amx-type %s -S | FileCheck %s
+; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s
 
 %struct.__tile_str = type { i16, i16, <256 x i32> }
 
Index: llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
===
--- /dev/null
+++ llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
@@ -0,0 +1,237 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics %s -S | FileCheck %s
+
+define dso_local void @test_amx_load_non_O0(i16 signext %row, i16 signext %col, i8 *%ptr, i64 %stride, <256 x i32>* %vptr) {
+; CHECK-LABEL: @test_amx_load_non_O0(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:[[TMP0:%.*]] = udiv i16 [[COL:%.*]], 4
+; CHECK-NEXT:[[TMP1:%.*]] = udiv i64 [[STRIDE:%.*]], 4
+; CHECK-NEXT:br label [[TILELOAD_SCALARIZE_ROWS_HEADER:%.*]]
+; CHECK:   tileload.scalarize.rows.header:
+; CHECK-NEXT:[[TILELOAD_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TILELOAD_SCALARIZE_ROWS_STEP:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_PHI_ROW:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP11:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH]] ]
+; CHECK-NEXT:br label [[TILELOAD_SCALARIZE_ROWS_BODY:%.*]]
+; CHECK:   tileload.scalarize.rows.body:
+; CHECK-NEXT:br label [[TILELOAD_SCALARIZE_COLS_HEADER:%.*]]
+; CHECK:   tileload.scalarize.cols.header:
+; CHECK-NEXT:[[TILELOAD_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TILELOAD_SCALARIZE_ROWS_BODY]] ], [ [[TILELOAD_SCALARIZE_COLS_STEP:%.*]], [[TILELOAD_SCALARIZE_COLS_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_PHI:%.*]] = phi <256 x i32> [ [[VEC_PHI_ROW]], [[TILELOAD_SCALARIZE_ROWS_BODY]] ], [ [[TMP11]], [[TILELOAD_SCALARIZE_COLS_LATCH]] ]
+; CHECK-NEXT:br label [[TILELOAD_SCALARIZE_COLS_BODY:%.*]]
+; CHECK:   tileload.scalarize.cols.body:
+; CHECK-NEXT:[[TMP2:%.*]] = zext i16 [[TILELOAD_SCALARIZE_ROWS_IV]] to i64
+; CHECK-NEXT:[[TMP3:%.*]] = zext i16 [[TILELOAD_SCALARIZE_COLS_IV]] to i64
+; CHECK-NEXT:[[TMP4:%.*]] = mul i64 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:[[TMP5:%.*]] = add i64 [[TMP4]], [[TMP3]]
+; CHECK-NEXT:[[TMP6:%.*]] = bitcast i8* [[PTR:%.*]] to i32*
+; CHECK-NEXT:[[TMP7:%.*]] = getelementptr i32, i32* [[TMP6]], i64 [[TMP5]]
+; CHECK-NEXT:[[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+; CHECK-NEXT:[[TMP9:%.*]] = mul i16 [[TILELOAD_SCALARIZE_ROWS_IV]], 16
+; CHECK-NEXT:[[TMP10:%.*]] = add i16 [[TMP9]], [[TILELOAD_SCALARIZE_COLS_IV]]
+; CHECK-NEXT:[[TMP11]] = insertelement <256 x i32> [[VEC_PHI]], i32 [[TMP8]], i16 [[TMP10]]
+; CHECK-NEXT:br label [[TILELOAD_SCALARIZE_COLS_LATCH]]
+; CHECK:   tileload.scalarize.cols.latch:
+; CHECK-NEXT:[[TILELOAD_SCALARIZE_COLS_STEP]] = add i16 [[TILELOAD_SCALARIZE_COLS_IV]], 1
+; CHECK-NEXT:[[TILELOAD_SCALARIZE_COLS_COND:%.*]] = icmp ne i16 [[TILELOAD_SCALARIZE_COLS_STEP]], [[TMP0]]
+; CHECK-NEXT:br i1 [[TILELOAD_SCALARIZE_COLS_COND]], label 

[PATCH] D93594: [X86] Pass to transform amx intrinsics to scalar operation.

2021-02-19 Thread Bing Yu via Phabricator via cfe-commits
yubing updated this revision to Diff 325152.
yubing added a comment.

Address the commments above.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D93594/new/

https://reviews.llvm.org/D93594

Files:
  llvm/include/llvm/CodeGen/Passes.h
  llvm/lib/Target/X86/CMakeLists.txt
  llvm/lib/Target/X86/X86.h
  llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
  llvm/lib/Target/X86/X86LowerAMXType.cpp
  llvm/lib/Target/X86/X86TargetMachine.cpp
  llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-bitcast.ll
  llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
  llvm/test/CodeGen/X86/AMX/amx-type.ll
  llvm/test/CodeGen/X86/O0-pipeline.ll

Index: llvm/test/CodeGen/X86/O0-pipeline.ll
===
--- llvm/test/CodeGen/X86/O0-pipeline.ll
+++ llvm/test/CodeGen/X86/O0-pipeline.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; When EXPENSIVE_CHECKS are enabled, the machine verifier appears between each
 ; pass. Ignore it with 'grep -v'.
 ; RUN: llc -mtriple=x86_64-- -O0 -debug-pass=Structure < %s -o /dev/null 2>&1 \
@@ -18,6 +19,9 @@
 ; CHECK-NEXT: Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT: FunctionPass Manager
 ; CHECK-NEXT:   Expand Atomic instructions
+; CHECK-NEXT:   Dominator Tree Construction
+; CHECK-NEXT:   Natural Loop Information
+; CHECK-NEXT:   Lower AMX intrinsics
 ; CHECK-NEXT:   Lower AMX type for load/store
 ; CHECK-NEXT:   Module Verifier
 ; CHECK-NEXT:   Lower Garbage Collection Instructions
Index: llvm/test/CodeGen/X86/AMX/amx-type.ll
===
--- llvm/test/CodeGen/X86/AMX/amx-type.ll
+++ llvm/test/CodeGen/X86/AMX/amx-type.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -lower-amx-type %s -S | FileCheck %s
+; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s
 
 %struct.__tile_str = type { i16, i16, <256 x i32> }
 
Index: llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
===
--- /dev/null
+++ llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
@@ -0,0 +1,237 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics %s -S | FileCheck %s
+
+define dso_local void @test_amx_load_non_O0(i16 signext %row, i16 signext %col, i8 *%ptr, i64 %stride, <256 x i32>* %vptr) {
+; CHECK-LABEL: @test_amx_load_non_O0(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:[[TMP0:%.*]] = udiv i16 [[COL:%.*]], 4
+; CHECK-NEXT:[[TMP1:%.*]] = udiv i64 [[STRIDE:%.*]], 4
+; CHECK-NEXT:br label [[TILELOAD_SCALARIZE_ROWS_HEADER:%.*]]
+; CHECK:   tileload.scalarize.rows.header:
+; CHECK-NEXT:[[TILELOAD_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TILELOAD_SCALARIZE_ROWS_STEP:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_PHI_ROW:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP11:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH]] ]
+; CHECK-NEXT:br label [[TILELOAD_SCALARIZE_ROWS_BODY:%.*]]
+; CHECK:   tileload.scalarize.rows.body:
+; CHECK-NEXT:br label [[TILELOAD_SCALARIZE_COLS_HEADER:%.*]]
+; CHECK:   tileload.scalarize.cols.header:
+; CHECK-NEXT:[[TILELOAD_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TILELOAD_SCALARIZE_ROWS_BODY]] ], [ [[TILELOAD_SCALARIZE_COLS_STEP:%.*]], [[TILELOAD_SCALARIZE_COLS_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_PHI:%.*]] = phi <256 x i32> [ [[VEC_PHI_ROW]], [[TILELOAD_SCALARIZE_ROWS_BODY]] ], [ [[TMP11]], [[TILELOAD_SCALARIZE_COLS_LATCH]] ]
+; CHECK-NEXT:br label [[TILELOAD_SCALARIZE_COLS_BODY:%.*]]
+; CHECK:   tileload.scalarize.cols.body:
+; CHECK-NEXT:[[TMP2:%.*]] = zext i16 [[TILELOAD_SCALARIZE_ROWS_IV]] to i64
+; CHECK-NEXT:[[TMP3:%.*]] = zext i16 [[TILELOAD_SCALARIZE_COLS_IV]] to i64
+; CHECK-NEXT:[[TMP4:%.*]] = mul i64 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:[[TMP5:%.*]] = add i64 [[TMP4]], [[TMP3]]
+; CHECK-NEXT:[[TMP6:%.*]] = bitcast i8* [[PTR:%.*]] to i32*
+; CHECK-NEXT:[[TMP7:%.*]] = getelementptr i32, i32* [[TMP6]], i64 [[TMP5]]
+; CHECK-NEXT:[[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+; CHECK-NEXT:[[TMP9:%.*]] = mul i16 [[TILELOAD_SCALARIZE_ROWS_IV]], 16
+; CHECK-NEXT:[[TMP10:%.*]] = add i16 [[TMP9]], [[TILELOAD_SCALARIZE_COLS_IV]]
+; CHECK-NEXT:[[TMP11]] = insertelement <256 x i32> [[VEC_PHI]], i32 [[TMP8]], i16 [[TMP10]]
+; CHECK-NEXT:br label [[TILELOAD_SCALARIZE_COLS_LATCH]]
+; CHECK:   tileload.scalarize.cols.latch:
+; CHECK-NEXT:[[TILELOAD_SCALARIZE_COLS_STEP]] = add i16 [[TILELOAD_SCALARIZE_COLS_IV]], 1
+; CHECK-NEXT:[[TILELOAD_SCALARIZE_COLS_COND:%.*]] = icmp ne i16 [[TILELOAD_SCALARIZE_COLS_STEP]], [[TMP0]]
+; CHECK-NEXT:br i1 [[TILELOAD_SCALARIZE_COLS_COND]], label [[TILELOAD_SCALARIZE_COLS_HEADER]], label 

[PATCH] D93594: [X86] Pass to transform amx intrinsics to scalar operation.

2021-02-19 Thread Bing Yu via Phabricator via cfe-commits
yubing added inline comments.



Comment at: llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp:211-212
+IRBuilderBase , DomTreeUpdater ,
+LoopInfo , Value *Row, Value *Col,
+Value *K, Value *Acc, Value *LHS,
+Value *RHS) {

xiangzhangllvm wrote:
> In fact, there is no need to handle Row, Col, K here; just use the fixed size
> 16x16 — the result of the calculation is the same in the effective area (we
> just need tileload to keep the "unused" area at 0).
> Then we can use a vector to handle all of them, and let type legalization
> split the type.
We should keep the code here. In bf16, since +0.0 (0x0000) * a negative float is
equal to -0.0 (0x8000), following your solution cannot ensure that the outer
edge is all zero.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D93594/new/

https://reviews.llvm.org/D93594

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D93594: [X86] Pass to transform amx intrinsics to scalar operation.

2021-02-08 Thread Bing Yu via Phabricator via cfe-commits
yubing added inline comments.



Comment at: llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll:77
+; CHECK-NEXT:[[TILEDPBSSD_UNROLL_ROWS_IV:%.*]] = phi i16 [ 0, 
[[ENTRY:%.*]] ], [ [[TILEDPBSSD_UNROLL_ROWS_STEP:%.*]], 
[[TILEDPBSSD_UNROLL_ROWS_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_PHI_ROW:%.*]] = phi <256 x i32> [ [[C]], [[ENTRY]] ], [ 
[[TMP18:%.*]], [[TILEDPBSSD_UNROLL_ROWS_LATCH]] ]
+; CHECK-NEXT:br label [[TILEDPBSSD_UNROLL_ROWS_BODY:%.*]]

Sorry, there is a bug here. According to AMX's spec, dst's remaining part 
should be all zero.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D93594/new/

https://reviews.llvm.org/D93594

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D93594: [X86] Pass to transform amx intrinsics to scalar operation.

2021-02-05 Thread Bing Yu via Phabricator via cfe-commits
yubing added a comment.

Strange — llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll passes on my local
machine.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D93594/new/

https://reviews.llvm.org/D93594

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D96110: [X86] Pass to transform tdpbf16ps intrinsics to scalar operation.

2021-02-05 Thread Bing Yu via Phabricator via cfe-commits
yubing updated this revision to Diff 321675.
yubing added a comment.

Rebase and add a testcase for dpbf16ps intrinsic.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D96110/new/

https://reviews.llvm.org/D96110

Files:
  clang/include/clang/Basic/BuiltinsX86_64.def
  clang/lib/Headers/amxintrin.h
  llvm/include/llvm/IR/IntrinsicsX86.td
  llvm/lib/Target/X86/X86ExpandPseudo.cpp
  llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
  llvm/lib/Target/X86/X86InstrAMX.td
  llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
  llvm/lib/Target/X86/X86LowerAMXType.cpp
  llvm/lib/Target/X86/X86PreTileConfig.cpp
  llvm/lib/Target/X86/X86RegisterInfo.cpp
  llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll

Index: llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
===
--- llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
+++ llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
@@ -63,8 +63,8 @@
   ret void
 }
 
-define dso_local void @test_amx_dp(i16 signext %row, i16 signext %col, i16 signext %k, <256 x i32> %c, <256 x i32> %a, <256 x i32> %b, <256 x i32>* %vptr) #0 {
-; CHECK-LABEL: @test_amx_dp(
+define dso_local void @test_amx_dpbssd(i16 signext %row, i16 signext %col, i16 signext %k, <256 x i32> %c, <256 x i32> %a, <256 x i32> %b, <256 x i32>* %vptr) #0 {
+; CHECK-LABEL: @test_amx_dpbssd(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:[[A_AMX:%.*]] = bitcast <256 x i32> [[A:%.*]] to x86_amx
 ; CHECK-NEXT:[[B_AMX:%.*]] = bitcast <256 x i32> [[B:%.*]] to x86_amx
@@ -133,6 +133,81 @@
   ret void
 }
 
+define dso_local void @test_amx_dpbf16ps(i16 signext %row, i16 signext %col, i16 signext %k, <256 x i32> %c, <256 x i32> %a, <256 x i32> %b, <256 x i32>* %vptr) #0 {
+; CHECK-LABEL: @test_amx_dpbf16ps(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:[[A_AMX:%.*]] = bitcast <256 x i32> [[A:%.*]] to x86_amx
+; CHECK-NEXT:[[B_AMX:%.*]] = bitcast <256 x i32> [[B:%.*]] to x86_amx
+; CHECK-NEXT:[[C_AMX:%.*]] = bitcast <256 x i32> [[C:%.*]] to x86_amx
+; CHECK-NEXT:[[TMP0:%.*]] = udiv i16 [[COL:%.*]], 4
+; CHECK-NEXT:[[TMP1:%.*]] = udiv i16 [[K:%.*]], 4
+; CHECK-NEXT:br label [[TILEDPBF16PS_UNROLL_ROWS_HEADER:%.*]]
+; CHECK:   tiledpbf16ps.unroll.rows.header:
+; CHECK-NEXT:[[TILEDPBF16PS_UNROLL_ROWS_IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TILEDPBF16PS_UNROLL_ROWS_STEP:%.*]], [[TILEDPBF16PS_UNROLL_ROWS_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_PHI_ROW:%.*]] = phi <256 x i32> [ [[C]], [[ENTRY]] ], [ [[TMP23:%.*]], [[TILEDPBF16PS_UNROLL_ROWS_LATCH]] ]
+; CHECK-NEXT:br label [[TILEDPBF16PS_UNROLL_ROWS_BODY:%.*]]
+; CHECK:   tiledpbf16ps.unroll.rows.body:
+; CHECK-NEXT:br label [[TILEDPBF16PS_UNROLL_COLS_HEADER:%.*]]
+; CHECK:   tiledpbf16ps.unroll.cols.header:
+; CHECK-NEXT:[[TILEDPBF16PS_UNROLL_COLS_IV:%.*]] = phi i16 [ 0, [[TILEDPBF16PS_UNROLL_ROWS_BODY]] ], [ [[TILEDPBF16PS_UNROLL_COLS_STEP:%.*]], [[TILEDPBF16PS_UNROLL_COLS_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_PHI_ROW]], [[TILEDPBF16PS_UNROLL_ROWS_BODY]] ], [ [[TMP23]], [[TILEDPBF16PS_UNROLL_COLS_LATCH]] ]
+; CHECK-NEXT:br label [[TILEDPBF16PS_UNROLL_COLS_BODY:%.*]]
+; CHECK:   tiledpbf16ps.unroll.cols.body:
+; CHECK-NEXT:br label [[TILEDPBF16PS_UNROLL_INNER_HEADER:%.*]]
+; CHECK:   tiledpbf16ps.unroll.inner.header:
+; CHECK-NEXT:[[TILEDPBF16PS_UNROLL_INNER_IV:%.*]] = phi i16 [ 0, [[TILEDPBF16PS_UNROLL_COLS_BODY]] ], [ [[TILEDPBF16PS_UNROLL_INNER_STEP:%.*]], [[TILEDPBF16PS_UNROLL_INNER_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_PHI:%.*]] = phi <256 x i32> [ [[VEC_PHI_COL]], [[TILEDPBF16PS_UNROLL_COLS_BODY]] ], [ [[TMP23]], [[TILEDPBF16PS_UNROLL_INNER_LATCH]] ]
+; CHECK-NEXT:br label [[TILEDPBF16PS_UNROLL_INNER_BODY:%.*]]
+; CHECK:   tiledpbf16ps.unroll.inner.body:
+; CHECK-NEXT:[[TMP2:%.*]] = mul i16 [[TILEDPBF16PS_UNROLL_ROWS_IV]], 16
+; CHECK-NEXT:[[TMP3:%.*]] = add i16 [[TMP2]], [[TILEDPBF16PS_UNROLL_COLS_IV]]
+; CHECK-NEXT:[[TMP4:%.*]] = mul i16 [[TILEDPBF16PS_UNROLL_ROWS_IV]], 16
+; CHECK-NEXT:[[TMP5:%.*]] = add i16 [[TMP4]], [[TILEDPBF16PS_UNROLL_INNER_IV]]
+; CHECK-NEXT:[[TMP6:%.*]] = mul i16 [[TILEDPBF16PS_UNROLL_INNER_IV]], 16
+; CHECK-NEXT:[[TMP7:%.*]] = add i16 [[TMP6]], [[TILEDPBF16PS_UNROLL_COLS_IV]]
+; CHECK-NEXT:[[TMP8:%.*]] = extractelement <256 x i32> [[VEC_PHI]], i16 [[TMP3]]
+; CHECK-NEXT:[[TMP9:%.*]] = bitcast i32 [[TMP8]] to float
+; CHECK-NEXT:[[TMP10:%.*]] = extractelement <256 x i32> [[A]], i16 [[TMP5]]
+; CHECK-NEXT:[[TMP11:%.*]] = bitcast i32 [[TMP10]] to <2 x i16>
+; CHECK-NEXT:[[TMP12:%.*]] = extractelement <256 x i32> [[B]], i16 [[TMP7]]
+; CHECK-NEXT:[[TMP13:%.*]] = bitcast i32 [[TMP12]] to <2 x i16>
+; CHECK-NEXT:[[TMP14:%.*]] = zext <2 x i16> [[TMP11]] to <2 x i32>
+; CHECK-NEXT:[[TMP15:%.*]] = shl <2 x i32> [[TMP14]], 
+; CHECK-NEXT:[[TMP16:%.*]] = bitcast <2 x i32> [[TMP15]] to <2 x float>
+; 

[PATCH] D93594: [X86] Pass to transform amx intrinsics to scalar operation.

2021-02-05 Thread Bing Yu via Phabricator via cfe-commits
yubing updated this revision to Diff 321673.
yubing added a comment.

Rebase and fix the bug in amx_api.c


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D93594/new/

https://reviews.llvm.org/D93594

Files:
  llvm/include/llvm/CodeGen/Passes.h
  llvm/lib/Target/X86/CMakeLists.txt
  llvm/lib/Target/X86/X86.h
  llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
  llvm/lib/Target/X86/X86TargetMachine.cpp
  llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
  llvm/test/CodeGen/X86/O0-pipeline.ll

Index: llvm/test/CodeGen/X86/O0-pipeline.ll
===
--- llvm/test/CodeGen/X86/O0-pipeline.ll
+++ llvm/test/CodeGen/X86/O0-pipeline.ll
@@ -18,7 +18,9 @@
 ; CHECK-NEXT: Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT: FunctionPass Manager
 ; CHECK-NEXT:   Expand Atomic instructions
-; CHECK-NEXT:   Lower AMX type for load/store
+; CHECK-NEXT:   Dominator Tree Construction
+; CHECK-NEXT:   Natural Loop Information
+; CHECK-NEXT:   Lower AMX intrinsics
 ; CHECK-NEXT:   Module Verifier
 ; CHECK-NEXT:   Lower Garbage Collection Instructions
 ; CHECK-NEXT:   Shadow Stack GC Lowering
Index: llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
===
--- /dev/null
+++ llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
@@ -0,0 +1,198 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -lower-amx-intrinsics %s -S | FileCheck %s
+
+define dso_local void @test_amx_load_non_O0(i16 signext %row, i16 signext %col, i8 *%ptr, i64 %stride, <256 x i32>* %vptr) {
+; CHECK-LABEL: @test_amx_load_non_O0(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:[[AMX:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW:%.*]], i16 [[COL:%.*]], i8* [[PTR:%.*]], i64 [[STRIDE:%.*]])
+; CHECK-NEXT:[[VEC:%.*]] = bitcast x86_amx [[AMX]] to <256 x i32>
+; CHECK-NEXT:store <256 x i32> [[VEC]], <256 x i32>* [[VPTR:%.*]], align 64
+; CHECK-NEXT:ret void
+;
+entry:
+  %amx = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %ptr, i64 %stride)
+  %vec = bitcast x86_amx %amx to <256 x i32>
+  store <256 x i32> %vec, <256 x i32>* %vptr, align 64
+  ret void
+}
+
+define dso_local void @test_amx_load(i16 signext %row, i16 signext %col, i8 *%ptr, i64 %stride, <256 x i32>* %vptr) #0 {
+; CHECK-LABEL: @test_amx_load(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:[[TMP0:%.*]] = udiv i16 [[COL:%.*]], 4
+; CHECK-NEXT:[[TMP1:%.*]] = udiv i64 [[STRIDE:%.*]], 4
+; CHECK-NEXT:br label [[TILELOAD_UNROLL_ROWS_HEADER:%.*]]
+; CHECK:   tileload.unroll.rows.header:
+; CHECK-NEXT:[[TILELOAD_UNROLL_ROWS_IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TILELOAD_UNROLL_ROWS_STEP:%.*]], [[TILELOAD_UNROLL_ROWS_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_PHI_ROW:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP11:%.*]], [[TILELOAD_UNROLL_ROWS_LATCH]] ]
+; CHECK-NEXT:br label [[TILELOAD_UNROLL_ROWS_BODY:%.*]]
+; CHECK:   tileload.unroll.rows.body:
+; CHECK-NEXT:br label [[TILELOAD_UNROLL_COLS_HEADER:%.*]]
+; CHECK:   tileload.unroll.cols.header:
+; CHECK-NEXT:[[TILELOAD_UNROLL_COLS_IV:%.*]] = phi i16 [ 0, [[TILELOAD_UNROLL_ROWS_BODY]] ], [ [[TILELOAD_UNROLL_COLS_STEP:%.*]], [[TILELOAD_UNROLL_COLS_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_PHI:%.*]] = phi <256 x i32> [ [[VEC_PHI_ROW]], [[TILELOAD_UNROLL_ROWS_BODY]] ], [ [[TMP11]], [[TILELOAD_UNROLL_COLS_LATCH]] ]
+; CHECK-NEXT:br label [[TILELOAD_UNROLL_COLS_BODY:%.*]]
+; CHECK:   tileload.unroll.cols.body:
+; CHECK-NEXT:[[TMP2:%.*]] = zext i16 [[TILELOAD_UNROLL_ROWS_IV]] to i64
+; CHECK-NEXT:[[TMP3:%.*]] = zext i16 [[TILELOAD_UNROLL_COLS_IV]] to i64
+; CHECK-NEXT:[[TMP4:%.*]] = mul i64 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:[[TMP5:%.*]] = add i64 [[TMP4]], [[TMP3]]
+; CHECK-NEXT:[[TMP6:%.*]] = bitcast i8* [[PTR:%.*]] to i32*
+; CHECK-NEXT:[[TMP7:%.*]] = getelementptr i32, i32* [[TMP6]], i64 [[TMP5]]
+; CHECK-NEXT:[[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+; CHECK-NEXT:[[TMP9:%.*]] = mul i16 [[TILELOAD_UNROLL_ROWS_IV]], 16
+; CHECK-NEXT:[[TMP10:%.*]] = add i16 [[TMP9]], [[TILELOAD_UNROLL_COLS_IV]]
+; CHECK-NEXT:[[TMP11]] = insertelement <256 x i32> [[VEC_PHI]], i32 [[TMP8]], i16 [[TMP10]]
+; CHECK-NEXT:br label [[TILELOAD_UNROLL_COLS_LATCH]]
+; CHECK:   tileload.unroll.cols.latch:
+; CHECK-NEXT:[[TILELOAD_UNROLL_COLS_STEP]] = add i16 [[TILELOAD_UNROLL_COLS_IV]], 1
+; CHECK-NEXT:[[TILELOAD_UNROLL_COLS_COND:%.*]] = icmp ne i16 [[TILELOAD_UNROLL_COLS_STEP]], [[TMP0]]
+; CHECK-NEXT:br i1 [[TILELOAD_UNROLL_COLS_COND]], label [[TILELOAD_UNROLL_COLS_HEADER]], label [[TILELOAD_UNROLL_ROWS_LATCH]]
+; CHECK:   tileload.unroll.rows.latch:
+; CHECK-NEXT:[[TILELOAD_UNROLL_ROWS_STEP]] = add i16 [[TILELOAD_UNROLL_ROWS_IV]], 1
+; CHECK-NEXT:[[TILELOAD_UNROLL_ROWS_COND:%.*]] = icmp ne i16 

[PATCH] D96110: [X86] Pass to transform tdpbf16ps intrinsics to scalar operation.

2021-02-04 Thread Bing Yu via Phabricator via cfe-commits
yubing created this revision.
Herald added subscribers: pengfei, hiraditya.
yubing requested review of this revision.
Herald added projects: clang, LLVM.
Herald added subscribers: llvm-commits, cfe-commits.

Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D96110

Files:
  clang/include/clang/Basic/BuiltinsX86_64.def
  clang/lib/Headers/amxintrin.h
  llvm/include/llvm/IR/IntrinsicsX86.td
  llvm/lib/Target/X86/X86ExpandPseudo.cpp
  llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
  llvm/lib/Target/X86/X86InstrAMX.td
  llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
  llvm/lib/Target/X86/X86LowerAMXType.cpp
  llvm/lib/Target/X86/X86PreTileConfig.cpp
  llvm/lib/Target/X86/X86RegisterInfo.cpp

Index: llvm/lib/Target/X86/X86RegisterInfo.cpp
===
--- llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -873,6 +873,7 @@
   // We only collect the tile shape that is defined.
   case X86::PTILELOADDV:
   case X86::PTDPBSSDV:
+  case X86::PTDPBF16PSV:
   case X86::PTILEZEROV:
 MachineOperand  = MI->getOperand(1);
 MachineOperand  = MI->getOperand(2);
Index: llvm/lib/Target/X86/X86PreTileConfig.cpp
===
--- llvm/lib/Target/X86/X86PreTileConfig.cpp
+++ llvm/lib/Target/X86/X86PreTileConfig.cpp
@@ -127,6 +127,7 @@
 llvm_unreachable("Unexpected machine instruction on tile");
   case X86::PTILELOADDV:
   case X86::PTDPBSSDV:
+  case X86::PTDPBF16PSV:
   case X86::PTILEZEROV:
 MachineOperand  = const_cast(MI.getOperand(1));
 MachineOperand  = const_cast(MI.getOperand(2));
@@ -221,6 +222,7 @@
   case X86::PTILELOADDV:
   case X86::PTILESTOREDV:
   case X86::PTDPBSSDV:
+  case X86::PTDPBF16PSV:
   case X86::PTILEZEROV:
 return true;
   }
Index: llvm/lib/Target/X86/X86LowerAMXType.cpp
===
--- llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -67,7 +67,8 @@
   }
   // a * b + c
   // The shape depends on which operand.
-  case Intrinsic::x86_tdpbssd_internal: {
+  case Intrinsic::x86_tdpbssd_internal:
+  case Intrinsic::x86_tdpbf16ps_internal:{
 switch (OpNo) {
 case 3:
   Row = II->getArgOperand(0);
Index: llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
===
--- llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
+++ llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
@@ -306,6 +306,111 @@
   return NewVecC;
 }
 
+static Value *createTileDPBF16PSLoops(BasicBlock *Start, BasicBlock *End,
+                                      IRBuilderBase &B, DomTreeUpdater &DTU,
+                                      LoopInfo &LI, Value *Row, Value *Col,
+                                      Value *K, Value *Acc, Value *LHS,
+                                      Value *RHS) {
+  Loop *RowLoop = LI.AllocateLoop();
+  Loop *ColLoop = LI.AllocateLoop();
+  Loop *InnerLoop = LI.AllocateLoop();
+  ColLoop->addChildLoop(InnerLoop);
+  RowLoop->addChildLoop(ColLoop);
+  if (Loop *ParentL = LI.getLoopFor(Start))
+ParentL->addChildLoop(RowLoop);
+  else
+LI.addTopLevelLoop(RowLoop);
+
+  BasicBlock *RowBody =
+  createLoop(Start, End, Row, B.getInt16(1), "tiledpbf16ps.unroll.rows", B,
+ DTU, RowLoop, LI);
+  BasicBlock *RowLatch = RowBody->getSingleSuccessor();
+
+  BasicBlock *ColBody =
+  createLoop(RowBody, RowLatch, Col, B.getInt16(1),
+ "tiledpbf16ps.unroll.cols", B, DTU, ColLoop, LI);
+  BasicBlock *ColLoopLatch = ColBody->getSingleSuccessor();
+
+  B.SetInsertPoint(ColBody->getTerminator());
+  BasicBlock *InnerBody =
+  createLoop(ColBody, ColLoopLatch, K, B.getInt16(1),
+ "tiledpbf16ps.unroll.inner", B, DTU, InnerLoop, LI);
+
+  BasicBlock *ColumnLoopHeader = ColBody->getSinglePredecessor();
+  BasicBlock *RowLoopHeader = RowBody->getSinglePredecessor();
+  BasicBlock *InnerLoopHeader = InnerBody->getSinglePredecessor();
+  BasicBlock *InnerLoopLatch = InnerBody->getSingleSuccessor();
+  Value *CurrentRow = &*RowLoopHeader->begin();
+  Value *CurrentCol = &*ColumnLoopHeader->begin();
+  Value *CurrentInner = &*InnerLoopHeader->begin();
+
+  FixedVectorType *V256I32Ty = FixedVectorType::get(B.getInt32Ty(), 256);
+  // Type *EltTy = V256I32Ty->getElementType();
+  Value *VecC, *VecA, *VecB;
+  if (auto BitCast = dyn_cast<BitCastInst>(Acc))
+    VecC = BitCast->getOperand(0);
+  assert(VecC->getType()->isVectorTy() && "bitcast from non-v256i32 to x86amx");
+  // TODO else create BitCast from x86amx to v256i32.
+  // Store x86amx to memory, and reload from memory
+  // to vector. However with -O0, it doesn't happen.
+  if (auto BitCast = dyn_cast<BitCastInst>(LHS))
+    VecA = BitCast->getOperand(0);
+  assert(VecA->getType()->isVectorTy() && "bitcast from non-v256i32 to x86amx");
+  if (auto BitCast = dyn_cast<BitCastInst>(RHS))
+    VecB = BitCast->getOperand(0);
+  

[PATCH] D93594: [X86] Pass to transform amx intrinsics to scalar operation.

2021-01-28 Thread Bing Yu via Phabricator via cfe-commits
yubing updated this revision to Diff 319797.
yubing added a comment.
Herald added a project: clang.
Herald added a subscriber: cfe-commits.

Fix some bugs in lowerTileDPBSSD, lowerTileStore, lowerTileLoad


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D93594/new/

https://reviews.llvm.org/D93594

Files:
  clang/lib/Headers/amxintrin.h
  llvm/include/llvm/CodeGen/Passes.h
  llvm/lib/Target/X86/CMakeLists.txt
  llvm/lib/Target/X86/X86.h
  llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
  llvm/lib/Target/X86/X86TargetMachine.cpp
  llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
  llvm/test/CodeGen/X86/O0-pipeline.ll

Index: llvm/test/CodeGen/X86/O0-pipeline.ll
===
--- llvm/test/CodeGen/X86/O0-pipeline.ll
+++ llvm/test/CodeGen/X86/O0-pipeline.ll
@@ -18,7 +18,9 @@
 ; CHECK-NEXT: Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT: FunctionPass Manager
 ; CHECK-NEXT:   Expand Atomic instructions
-; CHECK-NEXT:   Lower AMX type for load/store
+; CHECK-NEXT:   Dominator Tree Construction
+; CHECK-NEXT:   Natural Loop Information
+; CHECK-NEXT:   Lower AMX intrinsics
 ; CHECK-NEXT:   Module Verifier
 ; CHECK-NEXT:   Lower Garbage Collection Instructions
 ; CHECK-NEXT:   Shadow Stack GC Lowering
Index: llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
===
--- /dev/null
+++ llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
@@ -0,0 +1,198 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -lower-amx-intrinsics %s -S | FileCheck %s
+
+define dso_local void @test_amx_load_non_O0(i16 signext %row, i16 signext %col, i8 *%ptr, i64 %stride, <256 x i32>* %vptr) {
+; CHECK-LABEL: @test_amx_load_non_O0(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:[[AMX:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW:%.*]], i16 [[COL:%.*]], i8* [[PTR:%.*]], i64 [[STRIDE:%.*]])
+; CHECK-NEXT:[[VEC:%.*]] = bitcast x86_amx [[AMX]] to <256 x i32>
+; CHECK-NEXT:store <256 x i32> [[VEC]], <256 x i32>* [[VPTR:%.*]], align 64
+; CHECK-NEXT:ret void
+;
+entry:
+  %amx = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %ptr, i64 %stride)
+  %vec = bitcast x86_amx %amx to <256 x i32>
+  store <256 x i32> %vec, <256 x i32>* %vptr, align 64
+  ret void
+}
+
+define dso_local void @test_amx_load(i16 signext %row, i16 signext %col, i8 *%ptr, i64 %stride, <256 x i32>* %vptr) #0 {
+; CHECK-LABEL: @test_amx_load(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:[[TMP0:%.*]] = udiv i16 [[COL:%.*]], 4
+; CHECK-NEXT:[[TMP1:%.*]] = udiv i64 [[STRIDE:%.*]], 4
+; CHECK-NEXT:br label [[TILELOAD_UNROLL_ROWS_HEADER:%.*]]
+; CHECK:   tileload.unroll.rows.header:
+; CHECK-NEXT:[[TILELOAD_UNROLL_ROWS_IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TILELOAD_UNROLL_ROWS_STEP:%.*]], [[TILELOAD_UNROLL_ROWS_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_PHI_ROW:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP11:%.*]], [[TILELOAD_UNROLL_ROWS_LATCH]] ]
+; CHECK-NEXT:br label [[TILELOAD_UNROLL_ROWS_BODY:%.*]]
+; CHECK:   tileload.unroll.rows.body:
+; CHECK-NEXT:br label [[TILELOAD_UNROLL_COLS_HEADER:%.*]]
+; CHECK:   tileload.unroll.cols.header:
+; CHECK-NEXT:[[TILELOAD_UNROLL_COLS_IV:%.*]] = phi i16 [ 0, [[TILELOAD_UNROLL_ROWS_BODY]] ], [ [[TILELOAD_UNROLL_COLS_STEP:%.*]], [[TILELOAD_UNROLL_COLS_LATCH:%.*]] ]
+; CHECK-NEXT:[[VEC_PHI:%.*]] = phi <256 x i32> [ [[VEC_PHI_ROW]], [[TILELOAD_UNROLL_ROWS_BODY]] ], [ [[TMP11]], [[TILELOAD_UNROLL_COLS_LATCH]] ]
+; CHECK-NEXT:br label [[TILELOAD_UNROLL_COLS_BODY:%.*]]
+; CHECK:   tileload.unroll.cols.body:
+; CHECK-NEXT:[[TMP2:%.*]] = zext i16 [[TILELOAD_UNROLL_ROWS_IV]] to i64
+; CHECK-NEXT:[[TMP3:%.*]] = zext i16 [[TILELOAD_UNROLL_COLS_IV]] to i64
+; CHECK-NEXT:[[TMP4:%.*]] = mul i64 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:[[TMP5:%.*]] = add i64 [[TMP4]], [[TMP3]]
+; CHECK-NEXT:[[TMP6:%.*]] = bitcast i8* [[PTR:%.*]] to i32*
+; CHECK-NEXT:[[TMP7:%.*]] = getelementptr i32, i32* [[TMP6]], i64 [[TMP5]]
+; CHECK-NEXT:[[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+; CHECK-NEXT:[[TMP9:%.*]] = mul i16 [[TILELOAD_UNROLL_ROWS_IV]], 16
+; CHECK-NEXT:[[TMP10:%.*]] = add i16 [[TMP9]], [[TILELOAD_UNROLL_COLS_IV]]
+; CHECK-NEXT:[[TMP11]] = insertelement <256 x i32> [[VEC_PHI]], i32 [[TMP8]], i16 [[TMP10]]
+; CHECK-NEXT:br label [[TILELOAD_UNROLL_COLS_LATCH]]
+; CHECK:   tileload.unroll.cols.latch:
+; CHECK-NEXT:[[TILELOAD_UNROLL_COLS_STEP]] = add i16 [[TILELOAD_UNROLL_COLS_IV]], 1
+; CHECK-NEXT:[[TILELOAD_UNROLL_COLS_COND:%.*]] = icmp ne i16 [[TILELOAD_UNROLL_COLS_STEP]], [[TMP0]]
+; CHECK-NEXT:br i1 [[TILELOAD_UNROLL_COLS_COND]], label [[TILELOAD_UNROLL_COLS_HEADER]], label [[TILELOAD_UNROLL_ROWS_LATCH]]
+; CHECK:   tileload.unroll.rows.latch:
+; CHECK-NEXT:

[PATCH] D86668: Fix Calling Convention of __float128 and long double(128bits) in i386

2020-08-27 Thread Bing Yu via Phabricator via cfe-commits
yubing updated this revision to Diff 288218.
yubing added a comment.

Modify a testcase: clang/test/CodeGenCXX/float128-declarations.cpp


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D86668/new/

https://reviews.llvm.org/D86668

Files:
  clang/lib/CodeGen/TargetInfo.cpp
  clang/test/CodeGen/x86-long-double.cpp
  clang/test/CodeGen/x86_32-fp128-call-conv-linux.c
  clang/test/CodeGenCXX/float128-declarations.cpp

Index: clang/test/CodeGenCXX/float128-declarations.cpp
===
--- clang/test/CodeGenCXX/float128-declarations.cpp
+++ clang/test/CodeGenCXX/float128-declarations.cpp
@@ -3,7 +3,7 @@
 // RUN: %clang_cc1 -emit-llvm -triple powerpc64le-unknown-unknown \
 // RUN:   -target-feature +float128 -std=c++11 %s -o - | FileCheck %s
 // RUN: %clang_cc1 -emit-llvm -triple i386-unknown-linux-gnu -std=c++11 \
-// RUN:   %s -o - | FileCheck %s -check-prefix=CHECK-X86
+// RUN:   %s -o - | FileCheck %s -check-prefix=CHECK-X86-32-LINUX
 // RUN: %clang_cc1 -emit-llvm -triple x86_64-unknown-linux-gnu -std=c++11 \
 // RUN:   %s -o - | FileCheck %s -check-prefix=CHECK-X86
 // RUN: %clang_cc1 -emit-llvm -triple i686-pc-openbsd -std=c++11 \
@@ -123,3 +123,25 @@
 // CHECK-X86-DAG: [[F4L:%[a-z0-9]+]] = load fp128, fp128* %f4l
 // CHECK-X86-DAG: [[INC:%[a-z0-9]+]] = fadd fp128 [[F4L]], 0xL3FFF
 // CHECK-X86-DAG: store fp128 [[INC]], fp128* %f4l
+
+// CHECK-X86-32-LINUX-DAG: @_ZN12_GLOBAL__N_13f1nE = internal global fp128 0xL
+// CHECK-X86-32-LINUX-DAG: @_ZN12_GLOBAL__N_13f2nE = internal global fp128 0xL40040800
+// CHECK-X86-32-LINUX-DAG: @_ZN12_GLOBAL__N_15arr1nE = internal global [10 x fp128]
+// CHECK-X86-32-LINUX-DAG: @_ZN12_GLOBAL__N_15arr2nE = internal global [3 x fp128] [fp128 0xL3FFF, fp128 0xL40008000, fp128 0xL4025176592E0]
+// CHECK-X86-32-LINUX-DAG: define internal fp128 @_ZN12_GLOBAL__N_16func1nERKg(fp128*
+// CHECK-X86-32-LINUX-DAG: @f1f = global fp128 0xL
+// CHECK-X86-32-LINUX-DAG: @f2f = global fp128 0xL40040333
+// CHECK-X86-32-LINUX-DAG: @arr1f = global [10 x fp128]
+// CHECK-X86-32-LINUX-DAG: @arr2f = global [3 x fp128] [fp128 0xLBFFF, fp128 0xLC0008000, fp128 0xLC025176592E0]
+// CHECK-X86-32-LINUX-DAG: declare fp128 @_Z6func1fg(fp128* byval(fp128) align 16) #3
+// CHECK-X86-32-LINUX-DAG: define linkonce_odr void @_ZN2C1C2Eg(%class.C1* %this, fp128* byval(fp128) align 16 %0)
+// CHECK-X86-32-LINUX-DAG: define linkonce_odr fp128 @_ZN2C16func2cEg(fp128* byval(fp128) align 16 %0)
+// CHECK-X86-32-LINUX-DAG: define linkonce_odr fp128 @_Z6func1tIgET_S0_(fp128* byval(fp128) align 16 %0)
+// CHECK-X86-32-LINUX-DAG: @__const.main.s1 = private unnamed_addr constant %struct.S1 { fp128 0xL40060800 }
+// CHECK-X86-32-LINUX-DAG: store fp128 0xLF0AFD0EBFF292DCE42E0B38CDD83F26F, fp128* %f1l, align 16
+// CHECK-X86-32-LINUX-DAG: store fp128 0xL8000, fp128* %f2l, align 16
+// CHECK-X86-32-LINUX-DAG: store fp128 0xL7FFE, fp128* %f3l, align 16
+// CHECK-X86-32-LINUX-DAG: store fp128 0xLBFFF, fp128* %f5l, align 16
+// CHECK-X86-32-LINUX-DAG: [[F4L:%[a-z0-9]+]] = load fp128, fp128* %f4l
+// CHECK-X86-32-LINUX-DAG: [[INC:%[a-z0-9]+]] = fadd fp128 [[F4L]], 0xL3FFF
+// CHECK-X86-32-LINUX-DAG: store fp128 [[INC]], fp128* %f4l
Index: clang/test/CodeGen/x86_32-fp128-call-conv-linux.c
===
--- /dev/null
+++ clang/test/CodeGen/x86_32-fp128-call-conv-linux.c
@@ -0,0 +1,28 @@
+// RUN: %clang_cc1 -mlong-double-128 -w -fblocks -ffreestanding -triple i386-pc-linux-gnu -emit-llvm -o %t %s || FileCheck < %t %s
+
+
+// CHECK-LABEL: define void @testfp128
+// CHECK-NEXT:  %a.addr = alloca fp128, align 16
+// CHECK-NEXT:  %a = load fp128, fp128* %0, align 16
+// CHECK-NEXT:  store fp128 %a, fp128* %a.addr, align 16
+void testfp128(__float128 a) {
+  return;
+}
+
+// CHECK-LABEL: define void @testlongdouble
+// CHECK-NEXT:  %a.addr = alloca fp128, align 16
+// CHECK-NEXT:  %a = load fp128, fp128* %0, align 16
+// CHECK-NEXT:  store fp128 %a, fp128* %a.addr, align 16
+void testlongdouble(long double a) {
+  return;
+}
+
+// CHECK-LABEL: define void @testPassArguments
+// CHECK:   call void @testfp128(fp128* byval(fp128) align 16 %{{.*}})
+// CHECK:   call void @testlongdouble(fp128* byval(fp128) align 16 %{{.*}})
+void testPassArguments() {
+  __float128 a=1.0;
+  testfp128(a);
+  testlongdouble(a);
+  return;
+}
Index: clang/test/CodeGen/x86-long-double.cpp
===
--- 

[PATCH] D86668: Fix Calling Convention of __float128 and long double(128bits) in i386

2020-08-26 Thread Bing Yu via Phabricator via cfe-commits
yubing created this revision.
yubing added reviewers: craig.topper, LuoYuanke, LiuChen3.
Herald added a project: clang.
Herald added a subscriber: cfe-commits.
yubing requested review of this revision.

This patch makes __float128 / long double (128 bits) be passed on the stack with
16-byte alignment, according to the i386 System V ABI.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D86668

Files:
  clang/lib/CodeGen/TargetInfo.cpp
  clang/test/CodeGen/x86-long-double.cpp
  clang/test/CodeGen/x86_32-fp128-call-conv-linux.c

Index: clang/test/CodeGen/x86_32-fp128-call-conv-linux.c
===
--- /dev/null
+++ clang/test/CodeGen/x86_32-fp128-call-conv-linux.c
@@ -0,0 +1,28 @@
+// RUN: %clang_cc1 -mlong-double-128 -w -fblocks -ffreestanding -triple i386-pc-linux-gnu -emit-llvm -o %t %s || FileCheck < %t %s
+
+
+// CHECK-LABEL: define void @testfp128
+// CHECK-NEXT:  %a.addr = alloca fp128, align 16
+// CHECK-NEXT:  %a = load fp128, fp128* %0, align 16
+// CHECK-NEXT:  store fp128 %a, fp128* %a.addr, align 16
+void testfp128(__float128 a) {
+  return;
+}
+
+// CHECK-LABEL: define void @testlongdouble
+// CHECK-NEXT:  %a.addr = alloca fp128, align 16
+// CHECK-NEXT:  %a = load fp128, fp128* %0, align 16
+// CHECK-NEXT:  store fp128 %a, fp128* %a.addr, align 16
+void testlongdouble(long double a) {
+  return;
+}
+
+// CHECK-LABEL: define void @testPassArguments
+// CHECK:   call void @testfp128(fp128* byval(fp128) align 16 %{{.*}})
+// CHECK:   call void @testlongdouble(fp128* byval(fp128) align 16 %{{.*}})
+void testPassArguments() {
+  __float128 a=1.0;
+  testfp128(a);
+  testlongdouble(a);
+  return;
+}
Index: clang/test/CodeGen/x86-long-double.cpp
===
--- clang/test/CodeGen/x86-long-double.cpp
+++ clang/test/CodeGen/x86-long-double.cpp
@@ -16,14 +16,14 @@
 // RUN: %clang_cc1 %s -emit-llvm -o - -triple=x86_64-apple-darwin -mlong-double-64 | \
 // RUN:   FileCheck --check-prefixes=FP64,FP64-X64 %s
 
-// RUN: %clang_cc1 %s -emit-llvm -o - -triple=i686 -mlong-double-128 | \
-// RUN:   FileCheck --check-prefix=FP128 %s
+// RUN: %clang_cc1 %s -emit-llvm -o - -triple=i386-pc-linux-gnu -mlong-double-128 | \
+// RUN:   FileCheck --check-prefixes=FP128,FP128-X32-LINUX %s
 // RUN: %clang_cc1 %s -emit-llvm -o - -triple=i686-apple-darwin -mlong-double-128 | \
-// RUN:   FileCheck --check-prefix=FP128 %s
+// RUN:   FileCheck --check-prefixes=FP128,FP128-X32-DARWIN %s
 // RUN: %clang_cc1 %s -emit-llvm -o - -triple=x86_64 -mlong-double-128 | \
-// RUN:   FileCheck --check-prefix=FP128 %s
+// RUN:   FileCheck --check-prefixes=FP128,FP128-X64 %s
 // RUN: %clang_cc1 %s -emit-llvm -o - -triple=x86_64-apple-darwin -mlong-double-128 | \
-// RUN:   FileCheck --check-prefix=FP128 %s
+// RUN:   FileCheck --check-prefixes=FP128,FP128-X64 %s
 
 // Check -malign-double increases the alignment from 4 to 8 on x86-32.
 // RUN: %clang_cc1 %s -emit-llvm -o - -triple=i686 -mlong-double-64 \
@@ -53,4 +53,6 @@
 
 // FP64: double @_Z3fooe(double %d)
 // FP80: x86_fp80 @_Z3fooe(x86_fp80 %d)
-// FP128: fp128 @_Z3foog(fp128 %d)
+// FP128-X32-LINUX: fp128 @_Z3foog(fp128* byval(fp128) align 16 %0)
+// FP128-X32-DARWIN: fp128 @_Z3foog(fp128 %d)
+// FP128-X64: fp128 @_Z3foog(fp128 %d)
Index: clang/lib/CodeGen/TargetInfo.cpp
===
--- clang/lib/CodeGen/TargetInfo.cpp
+++ clang/lib/CodeGen/TargetInfo.cpp
@@ -1108,6 +1108,7 @@
   bool IsWin32StructABI;
   bool IsSoftFloatABI;
   bool IsMCUABI;
+  bool IsLinuxABI;
   unsigned DefaultNumRegisterParameters;
 
   static bool isRegisterSize(unsigned Size) {
@@ -1173,6 +1174,7 @@
   IsWin32StructABI(Win32StructABI),
   IsSoftFloatABI(SoftFloatABI),
   IsMCUABI(CGT.getTarget().getTriple().isOSIAMCU()),
+  IsLinuxABI(CGT.getTarget().getTriple().isOSLinux()),
   DefaultNumRegisterParameters(NumRegisterParameters) {}
 
   bool shouldPassIndirectlyForSwift(ArrayRef<CharUnits> scalars,
@@ -1597,6 +1599,9 @@
   if (Align <= MinABIStackAlignInBytes)
 return 0; // Use default alignment.
 
+  if (IsLinuxABI && Ty->isFloatingType() && getContext().getTypeSize(Ty) == 128)
+return 16;
+
   // On non-Darwin, the stack type alignment is always 4.
   if (!IsDarwinVectorABI) {
 // Set explicit alignment, since we may need to realign the top.
@@ -1890,6 +1895,9 @@
 return getIndirectResult(Ty, /*ByVal=*/false, State);
   }
 
+  if (IsLinuxABI && Ty->isFloatingType() && getContext().getTypeSize(Ty) == 128)
+return getIndirectResult(Ty, /*ByVal=*/true, State);
+
   if (InReg)
 return ABIArgInfo::getDirectInReg();
   return ABIArgInfo::getDirect();
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits