[PATCH] D47724: [X86] Add back _mask, _maskz, and _mask3 builtins for some 512-bit fmadd/fmsub/fmaddsub/fmsubadd builtins.

2018-06-06 Thread Phabricator via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rL334159: [X86] Add back _mask, _maskz, and _mask3 builtins 
for some 512-bit… (authored by ctopper, committed by ).
Herald added a subscriber: llvm-commits.

Repository:
  rL LLVM

https://reviews.llvm.org/D47724

Files:
  cfe/trunk/include/clang/Basic/BuiltinsX86.def
  cfe/trunk/lib/CodeGen/CGBuiltin.cpp
  cfe/trunk/lib/Headers/avx512fintrin.h
  cfe/trunk/lib/Sema/SemaChecking.cpp
  cfe/trunk/test/CodeGen/avx512f-builtins.c

Index: cfe/trunk/lib/CodeGen/CGBuiltin.cpp
===
--- cfe/trunk/lib/CodeGen/CGBuiltin.cpp
+++ cfe/trunk/lib/CodeGen/CGBuiltin.cpp
@@ -8555,79 +8555,110 @@
 
 // Lowers X86 FMA intrinsics to IR.
 static Value *EmitX86FMAExpr(CodeGenFunction &CGF, ArrayRef Ops,
- unsigned BuiltinID) {
+ unsigned BuiltinID, bool IsAddSub) {
 
-  bool IsAddSub = false;
-  bool IsScalar = false;
-
-  // 4 operands always means rounding mode without a mask here.
-  bool IsRound = Ops.size() == 4;
-
-  Intrinsic::ID ID;
+  bool Subtract = false;
+  Intrinsic::ID IID = Intrinsic::not_intrinsic;
   switch (BuiltinID) {
   default: break;
-  case clang::X86::BI__builtin_ia32_vfmaddss3: IsScalar = true; break;
-  case clang::X86::BI__builtin_ia32_vfmaddsd3: IsScalar = true; break;
-  case clang::X86::BI__builtin_ia32_vfmaddps512:
-ID = llvm::Intrinsic::x86_avx512_vfmadd_ps_512; break;
-  case clang::X86::BI__builtin_ia32_vfmaddpd512:
-ID = llvm::Intrinsic::x86_avx512_vfmadd_pd_512; break;
-  case clang::X86::BI__builtin_ia32_vfmaddsubps: IsAddSub = true; break;
-  case clang::X86::BI__builtin_ia32_vfmaddsubpd: IsAddSub = true; break;
-  case clang::X86::BI__builtin_ia32_vfmaddsubps256: IsAddSub = true; break;
-  case clang::X86::BI__builtin_ia32_vfmaddsubpd256: IsAddSub = true; break;
-  case clang::X86::BI__builtin_ia32_vfmaddsubps512: {
-ID = llvm::Intrinsic::x86_avx512_vfmaddsub_ps_512;
-IsAddSub = true;
+  case clang::X86::BI__builtin_ia32_vfmsubps512_mask3:
+Subtract = true;
+LLVM_FALLTHROUGH;
+  case clang::X86::BI__builtin_ia32_vfmaddps512_mask:
+  case clang::X86::BI__builtin_ia32_vfmaddps512_maskz:
+  case clang::X86::BI__builtin_ia32_vfmaddps512_mask3:
+IID = llvm::Intrinsic::x86_avx512_vfmadd_ps_512; break;
+  case clang::X86::BI__builtin_ia32_vfmsubpd512_mask3:
+Subtract = true;
+LLVM_FALLTHROUGH;
+  case clang::X86::BI__builtin_ia32_vfmaddpd512_mask:
+  case clang::X86::BI__builtin_ia32_vfmaddpd512_maskz:
+  case clang::X86::BI__builtin_ia32_vfmaddpd512_mask3:
+IID = llvm::Intrinsic::x86_avx512_vfmadd_pd_512; break;
+  case clang::X86::BI__builtin_ia32_vfmsubaddps512_mask3:
+Subtract = true;
+LLVM_FALLTHROUGH;
+  case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask:
+  case clang::X86::BI__builtin_ia32_vfmaddsubps512_maskz:
+  case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask3:
+IID = llvm::Intrinsic::x86_avx512_vfmaddsub_ps_512;
 break;
-  }
-  case clang::X86::BI__builtin_ia32_vfmaddsubpd512: {
-ID = llvm::Intrinsic::x86_avx512_vfmaddsub_pd_512;
-IsAddSub = true;
+  case clang::X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
+Subtract = true;
+LLVM_FALLTHROUGH;
+  case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask:
+  case clang::X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
+  case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
+IID = llvm::Intrinsic::x86_avx512_vfmaddsub_pd_512;
 break;
   }
-  }
-
-  // Only handle in case of _MM_FROUND_CUR_DIRECTION/4 (no rounding).
-  if (IsRound) {
-Function *Intr = CGF.CGM.getIntrinsic(ID);
-if (cast(Ops[3])->getZExtValue() != (uint64_t)4)
-  return CGF.Builder.CreateCall(Intr, Ops);
-  }
 
   Value *A = Ops[0];
   Value *B = Ops[1];
   Value *C = Ops[2];
 
-  if (IsScalar) {
-A = CGF.Builder.CreateExtractElement(A, (uint64_t)0);
-B = CGF.Builder.CreateExtractElement(B, (uint64_t)0);
-C = CGF.Builder.CreateExtractElement(C, (uint64_t)0);
-  }
-
-  llvm::Type *Ty = A->getType();
-  Function *FMA = CGF.CGM.getIntrinsic(Intrinsic::fma, Ty);
-  Value *Res = CGF.Builder.CreateCall(FMA, {A, B, C} );
-
-  if (IsScalar)
-return CGF.Builder.CreateInsertElement(Ops[0], Res, (uint64_t)0);
-
-  if (IsAddSub) {
-// Negate even elts in C using a mask.
-unsigned NumElts = Ty->getVectorNumElements();
-SmallVector NMask;
-Constant *Zero = ConstantInt::get(CGF.Builder.getInt1Ty(), 0);
-Constant *One = ConstantInt::get(CGF.Builder.getInt1Ty(), 1);
-for (unsigned i = 0; i < NumElts; ++i) {
-  NMask.push_back(i % 2 == 0 ? One : Zero);
-}
-Value *NegMask = ConstantVector::get(NMask);
-
-Value *NegC = CGF.Builder.CreateFNeg(C);
-Value *FMSub = CGF.Builder.CreateCall(FMA, {A, B, NegC} );
-Res = CGF.Builder.CreateSelect(NegMask, FMSub, Res);
+  if (Subtract)
+C = CGF.Builder.CreateFNeg(C);
+
+  Value *Res;
+
+  //

[PATCH] D47724: [X86] Add back _mask, _maskz, and _mask3 builtins for some 512-bit fmadd/fmsub/fmaddsub/fmsubadd builtins.

2018-06-06 Thread Tomasz Krupa via Phabricator via cfe-commits
tkrupa accepted this revision.
tkrupa added a comment.
This revision is now accepted and ready to land.

LGTM


Repository:
  rC Clang

https://reviews.llvm.org/D47724



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D47724: [X86] Add back _mask, _maskz, and _mask3 builtins for some 512-bit fmadd/fmsub/fmaddsub/fmsubadd builtins.

2018-06-05 Thread Craig Topper via Phabricator via cfe-commits
craig.topper updated this revision to Diff 150027.
craig.topper added a comment.

Add subtract builtins to SemaChecking.cpp


Repository:
  rC Clang

https://reviews.llvm.org/D47724

Files:
  include/clang/Basic/BuiltinsX86.def
  lib/CodeGen/CGBuiltin.cpp
  lib/Headers/avx512fintrin.h
  lib/Sema/SemaChecking.cpp
  test/CodeGen/avx512f-builtins.c

Index: test/CodeGen/avx512f-builtins.c
===
--- test/CodeGen/avx512f-builtins.c
+++ test/CodeGen/avx512f-builtins.c
@@ -461,7 +461,7 @@
   // CHECK-LABEL: @test_mm512_maskz_fmadd_round_pd
   // CHECK: @llvm.x86.avx512.vfmadd.pd.512
   // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
   return _mm512_maskz_fmadd_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 }
 __m512d test_mm512_fmsub_round_pd(__m512d __A, __m512d __B, __m512d __C) {
@@ -483,7 +483,7 @@
   // CHECK: fsub <8 x double> 
   // CHECK: @llvm.x86.avx512.vfmadd.pd.512
   // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
   return _mm512_maskz_fmsub_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 }
 __m512d test_mm512_fnmadd_round_pd(__m512d __A, __m512d __B, __m512d __C) {
@@ -505,7 +505,7 @@
   // CHECK: fsub <8 x double> 
   // CHECK: @llvm.x86.avx512.vfmadd.pd.512
   // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
   return _mm512_maskz_fnmadd_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 }
 __m512d test_mm512_fnmsub_round_pd(__m512d __A, __m512d __B, __m512d __C) {
@@ -521,7 +521,7 @@
   // CHECK: fsub <8 x double> 
   // CHECK: @llvm.x86.avx512.vfmadd.pd.512
   // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
   return _mm512_maskz_fnmsub_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 }
 __m512d test_mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C) {
@@ -547,7 +547,7 @@
   // CHECK-LABEL: @test_mm512_maskz_fmadd_pd
   // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
   // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
   return _mm512_maskz_fmadd_pd(__U, __A, __B, __C);
 }
 __m512d test_mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C) {
@@ -569,7 +569,7 @@
   // CHECK: fsub <8 x double> , %{{.*}}
   // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
   // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
   return _mm512_maskz_fmsub_pd(__U, __A, __B, __C);
 }
 __m512d test_mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C) {
@@ -591,7 +591,7 @@
   // CHECK: fsub <8 x double> , %{{.*}}
   // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
   // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
   return _mm512_maskz_fnmadd_pd(__U, __A, __B, __C);
 }
 __m512d test_mm512_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C) {
@@ -607,7 +607,7 @@
   // CHECK: fsub <8 x double> , %{{.*}}
   // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
   // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
   return _mm512_maskz_fnmsub_pd(__U, __A, __B, __C);
 }
 __m512 test_mm512_fmadd_round_ps(__m512 __A, __m512 __B, __m512 __C) {
@@ -633,7 +633,7 @@
   // CHECK-LABEL: @test_mm512_maskz_fmadd_round_ps
   // CHECK: @llvm.x86.avx512.vfmadd.ps.512
   // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer
   return _mm512_maskz_fmadd_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_

[PATCH] D47724: [X86] Add back _mask, _maskz, and _mask3 builtins for some 512-bit fmadd/fmsub/fmaddsub/fmsubadd builtins.

2018-06-05 Thread Tomasz Krupa via Phabricator via cfe-commits
tkrupa added inline comments.



Comment at: lib/Sema/SemaChecking.cpp:2388
+  case X86::BI__builtin_ia32_vfmaddsubps512_maskz:
+  case X86::BI__builtin_ia32_vfmaddsubps512_mask3:
 ArgNum = 4;

vfmsubps512_mask3, vfmsubpd512_mask3, vfmsubaddps512_mask3 and 
vfmsubaddpd512_mask3 are missing.


Repository:
  rC Clang

https://reviews.llvm.org/D47724



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D47724: [X86] Add back _mask, _maskz, and _mask3 builtins for some 512-bit fmadd/fmsub/fmaddsub/fmsubadd builtins.

2018-06-04 Thread Craig Topper via Phabricator via cfe-commits
craig.topper created this revision.
craig.topper added reviewers: tkrupa, RKSimon, spatel, GBuella.

We recently switch to using a selects in the intrinsics header files for FMA 
instructions. But the 512-bit versions support flavors with rounding mode which 
must be an Integer Constant Expression. This has forced those intrinsics to be 
implemented as macros. As it stands now the mask and mask3 intrinsics evaluate 
one of their macro arguments twice. If that argument itself is another 
intrinsic macro, we can end up over expanding macros. Or if its something we 
can CSE later it would show up multiple times when it shouldn't.

I tried adding __extension__ around the macro and making it an expression 
statement and declaring a local variable. But whatever name you choose for the 
local variable can never be used as the name of an input to the macro in user 
code. If that happens you would end up with the same name on the LHS and RHS of 
an assignment after expansion. We might be safe if we use __ in front of the 
variable names because those names are reserved and user code shouldn't use 
that, but I wasn't sure I wanted to make that claim.

The other option which I've chosen here, is to add back _mask, _maskz, and 
_mask3 flavors of the builtin which we will expand in CGBuiltin.cpp to 
replicate the argument as needed and insert any fneg needed on the third 
operand to make a subtract. The _maskz isn't truly necessary if we have an 
unmasked version or if we use the masked version with a -1 mask and wrap a 
select around it. But I've chosen to make things more uniform.

I separated out the scalar builtin handling to avoid too many things going on 
in EmitX86FMAExpr. It was different enough due to the extract and insert that 
the minor duplication of the CreateCall was probably worth it.


Repository:
  rC Clang

https://reviews.llvm.org/D47724

Files:
  include/clang/Basic/BuiltinsX86.def
  lib/CodeGen/CGBuiltin.cpp
  lib/Headers/avx512fintrin.h
  lib/Sema/SemaChecking.cpp
  test/CodeGen/avx512f-builtins.c

Index: test/CodeGen/avx512f-builtins.c
===
--- test/CodeGen/avx512f-builtins.c
+++ test/CodeGen/avx512f-builtins.c
@@ -461,7 +461,7 @@
   // CHECK-LABEL: @test_mm512_maskz_fmadd_round_pd
   // CHECK: @llvm.x86.avx512.vfmadd.pd.512
   // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
   return _mm512_maskz_fmadd_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 }
 __m512d test_mm512_fmsub_round_pd(__m512d __A, __m512d __B, __m512d __C) {
@@ -483,7 +483,7 @@
   // CHECK: fsub <8 x double> 
   // CHECK: @llvm.x86.avx512.vfmadd.pd.512
   // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
   return _mm512_maskz_fmsub_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 }
 __m512d test_mm512_fnmadd_round_pd(__m512d __A, __m512d __B, __m512d __C) {
@@ -505,7 +505,7 @@
   // CHECK: fsub <8 x double> 
   // CHECK: @llvm.x86.avx512.vfmadd.pd.512
   // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
   return _mm512_maskz_fnmadd_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 }
 __m512d test_mm512_fnmsub_round_pd(__m512d __A, __m512d __B, __m512d __C) {
@@ -521,7 +521,7 @@
   // CHECK: fsub <8 x double> 
   // CHECK: @llvm.x86.avx512.vfmadd.pd.512
   // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
   return _mm512_maskz_fnmsub_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 }
 __m512d test_mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C) {
@@ -547,7 +547,7 @@
   // CHECK-LABEL: @test_mm512_maskz_fmadd_pd
   // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
   // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
   return _mm512_maskz_fmadd_pd(__U, __A, __B, __C);
 }
 __m512d test_mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C) {
@@ -569,7 +569,7 @@
   // CHECK: fsub <8 x double> , %{{.*}}
   // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
   // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8