pengfei updated this revision to Diff 238408.
pengfei added a comment.
Remove dead code.
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D72820/new/
https://reviews.llvm.org/D72820
Files:
clang/lib/CodeGen/CGExprScalar.cpp
clang/test/CodeGen/constrained-math-builtins.c
llvm/docs/LangRef.rst
llvm/include/llvm/CodeGen/BasicTTIImpl.h
llvm/include/llvm/CodeGen/ISDOpcodes.h
llvm/include/llvm/IR/ConstrainedOps.def
llvm/include/llvm/IR/Intrinsics.td
llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/lib/Target/X86/X86ISelLowering.h
llvm/lib/Target/X86/X86InstrAVX512.td
llvm/lib/Target/X86/X86InstrFMA.td
llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
llvm/test/CodeGen/X86/fp-intrinsics-fma.ll
Index: llvm/test/CodeGen/X86/fp-intrinsics-fma.ll
===================================================================
--- llvm/test/CodeGen/X86/fp-intrinsics-fma.ll
+++ llvm/test/CodeGen/X86/fp-intrinsics-fma.ll
@@ -1,7 +1,339 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -mtriple=x86_64-pc-linux < %s | FileCheck %s --check-prefixes=COMMON,NOFMA
-; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+fma < %s | FileCheck %s --check-prefixes=COMMON,FMA
-; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+avx512f < %s | FileCheck %s --check-prefixes=COMMON,FMA
+; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+fma < %s | FileCheck %s --check-prefixes=COMMON,FMA,FMA-AVX1
+; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+avx512f < %s | FileCheck %s --check-prefixes=COMMON,FMA,FMA-AVX512
+
+define float @f1(float %0, float %1, float %2) #0 {
+; NOFMA-LABEL: f1:
+; NOFMA: # %bb.0: # %entry
+; NOFMA-NEXT: xorps {{.*}}(%rip), %xmm0
+; NOFMA-NEXT: mulss %xmm1, %xmm0
+; NOFMA-NEXT: addss %xmm2, %xmm0
+; NOFMA-NEXT: retq
+;
+; FMA-LABEL: f1:
+; FMA: # %bb.0: # %entry
+; FMA-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
+; FMA-NEXT: retq
+entry:
+ %3 = fneg float %0
+ %result = call float @llvm.experimental.constrained.fmuladd.f32(float %3, float %1, float %2,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret float %result
+}
+
+define double @f2(double %0, double %1, double %2) #0 {
+; NOFMA-LABEL: f2:
+; NOFMA: # %bb.0: # %entry
+; NOFMA-NEXT: xorpd {{.*}}(%rip), %xmm0
+; NOFMA-NEXT: mulsd %xmm1, %xmm0
+; NOFMA-NEXT: addsd %xmm2, %xmm0
+; NOFMA-NEXT: retq
+;
+; FMA-LABEL: f2:
+; FMA: # %bb.0: # %entry
+; FMA-NEXT: vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
+; FMA-NEXT: retq
+entry:
+ %3 = fneg double %0
+ %result = call double @llvm.experimental.constrained.fmuladd.f64(double %3, double %1, double %2,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret double %result
+}
+
+define float @f3(float %0, float %1, float %2) #0 {
+; NOFMA-LABEL: f3:
+; NOFMA: # %bb.0: # %entry
+; NOFMA-NEXT: xorps {{.*}}(%rip), %xmm2
+; NOFMA-NEXT: mulss %xmm1, %xmm0
+; NOFMA-NEXT: addss %xmm2, %xmm0
+; NOFMA-NEXT: retq
+;
+; FMA-LABEL: f3:
+; FMA: # %bb.0: # %entry
+; FMA-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
+; FMA-NEXT: retq
+entry:
+ %3 = fneg float %2
+ %result = call float @llvm.experimental.constrained.fmuladd.f32(float %0, float %1, float %3,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret float %result
+}
+
+define double @f4(double %0, double %1, double %2) #0 {
+; NOFMA-LABEL: f4:
+; NOFMA: # %bb.0: # %entry
+; NOFMA-NEXT: xorpd {{.*}}(%rip), %xmm2
+; NOFMA-NEXT: mulsd %xmm1, %xmm0
+; NOFMA-NEXT: addsd %xmm2, %xmm0
+; NOFMA-NEXT: retq
+;
+; FMA-LABEL: f4:
+; FMA: # %bb.0: # %entry
+; FMA-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
+; FMA-NEXT: retq
+entry:
+ %3 = fneg double %2
+ %result = call double @llvm.experimental.constrained.fmuladd.f64(double %0, double %1, double %3,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret double %result
+}
+
+define float @f5(float %0, float %1, float %2) #0 {
+; NOFMA-LABEL: f5:
+; NOFMA: # %bb.0: # %entry
+; NOFMA-NEXT: movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; NOFMA-NEXT: xorps %xmm3, %xmm0
+; NOFMA-NEXT: xorps %xmm3, %xmm2
+; NOFMA-NEXT: mulss %xmm1, %xmm0
+; NOFMA-NEXT: addss %xmm2, %xmm0
+; NOFMA-NEXT: retq
+;
+; FMA-LABEL: f5:
+; FMA: # %bb.0: # %entry
+; FMA-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
+; FMA-NEXT: retq
+entry:
+ %3 = fneg float %0
+ %4 = fneg float %2
+ %result = call float @llvm.experimental.constrained.fmuladd.f32(float %3, float %1, float %4,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret float %result
+}
+
+define double @f6(double %0, double %1, double %2) #0 {
+; NOFMA-LABEL: f6:
+; NOFMA: # %bb.0: # %entry
+; NOFMA-NEXT: movapd {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0]
+; NOFMA-NEXT: xorpd %xmm3, %xmm0
+; NOFMA-NEXT: xorpd %xmm3, %xmm2
+; NOFMA-NEXT: mulsd %xmm1, %xmm0
+; NOFMA-NEXT: addsd %xmm2, %xmm0
+; NOFMA-NEXT: retq
+;
+; FMA-LABEL: f6:
+; FMA: # %bb.0: # %entry
+; FMA-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
+; FMA-NEXT: retq
+entry:
+ %3 = fneg double %0
+ %4 = fneg double %2
+ %result = call double @llvm.experimental.constrained.fmuladd.f64(double %3, double %1, double %4,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret double %result
+}
+
+define float @f7(float %0, float %1, float %2) #0 {
+; NOFMA-LABEL: f7:
+; NOFMA: # %bb.0: # %entry
+; NOFMA-NEXT: mulss %xmm1, %xmm0
+; NOFMA-NEXT: addss %xmm2, %xmm0
+; NOFMA-NEXT: xorps {{.*}}(%rip), %xmm0
+; NOFMA-NEXT: retq
+;
+; FMA-AVX1-LABEL: f7:
+; FMA-AVX1: # %bb.0: # %entry
+; FMA-AVX1-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
+; FMA-AVX1-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
+; FMA-AVX1-NEXT: retq
+;
+; FMA-AVX512-LABEL: f7:
+; FMA-AVX512: # %bb.0: # %entry
+; FMA-AVX512-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
+; FMA-AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; FMA-AVX512-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; FMA-AVX512-NEXT: retq
+entry:
+ %3 = call float @llvm.experimental.constrained.fmuladd.f32(float %0, float %1, float %2,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ %result = fneg float %3
+ ret float %result
+}
+
+define double @f8(double %0, double %1, double %2) #0 {
+; NOFMA-LABEL: f8:
+; NOFMA: # %bb.0: # %entry
+; NOFMA-NEXT: mulsd %xmm1, %xmm0
+; NOFMA-NEXT: addsd %xmm2, %xmm0
+; NOFMA-NEXT: xorpd {{.*}}(%rip), %xmm0
+; NOFMA-NEXT: retq
+;
+; FMA-LABEL: f8:
+; FMA: # %bb.0: # %entry
+; FMA-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
+; FMA-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0
+; FMA-NEXT: retq
+entry:
+ %3 = call double @llvm.experimental.constrained.fmuladd.f64(double %0, double %1, double %2,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ %result = fneg double %3
+ ret double %result
+}
+
+define float @f9(float %0, float %1, float %2) #0 {
+; NOFMA-LABEL: f9:
+; NOFMA: # %bb.0: # %entry
+; NOFMA-NEXT: movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; NOFMA-NEXT: xorps %xmm3, %xmm0
+; NOFMA-NEXT: xorps %xmm3, %xmm2
+; NOFMA-NEXT: mulss %xmm1, %xmm0
+; NOFMA-NEXT: addss %xmm2, %xmm0
+; NOFMA-NEXT: xorps %xmm3, %xmm0
+; NOFMA-NEXT: retq
+;
+; FMA-AVX1-LABEL: f9:
+; FMA-AVX1: # %bb.0: # %entry
+; FMA-AVX1-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
+; FMA-AVX1-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
+; FMA-AVX1-NEXT: retq
+;
+; FMA-AVX512-LABEL: f9:
+; FMA-AVX512: # %bb.0: # %entry
+; FMA-AVX512-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
+; FMA-AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; FMA-AVX512-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; FMA-AVX512-NEXT: retq
+entry:
+ %3 = fneg float %0
+ %4 = fneg float %2
+ %5 = call float @llvm.experimental.constrained.fmuladd.f32(float %3, float %1, float %4,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ %result = fneg float %5
+ ret float %result
+}
+
+define double @f10(double %0, double %1, double %2) #0 {
+; NOFMA-LABEL: f10:
+; NOFMA: # %bb.0: # %entry
+; NOFMA-NEXT: movapd {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0]
+; NOFMA-NEXT: xorpd %xmm3, %xmm0
+; NOFMA-NEXT: xorpd %xmm3, %xmm2
+; NOFMA-NEXT: mulsd %xmm1, %xmm0
+; NOFMA-NEXT: addsd %xmm2, %xmm0
+; NOFMA-NEXT: xorpd %xmm3, %xmm0
+; NOFMA-NEXT: retq
+;
+; FMA-LABEL: f10:
+; FMA: # %bb.0: # %entry
+; FMA-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
+; FMA-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0
+; FMA-NEXT: retq
+entry:
+ %3 = fneg double %0
+ %4 = fneg double %2
+ %5 = call double @llvm.experimental.constrained.fmuladd.f64(double %3, double %1, double %4,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ %result = fneg double %5
+ ret double %result
+}
+
+; Verify constrained fmul and fadd aren't fused.
+define float @f11(float %0, float %1, float %2) #0 {
+; NOFMA-LABEL: f11:
+; NOFMA: # %bb.0: # %entry
+; NOFMA-NEXT: mulss %xmm1, %xmm0
+; NOFMA-NEXT: addss %xmm2, %xmm0
+; NOFMA-NEXT: retq
+;
+; FMA-LABEL: f11:
+; FMA: # %bb.0: # %entry
+; FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; FMA-NEXT: vaddss %xmm2, %xmm0, %xmm0
+; FMA-NEXT: retq
+entry:
+ %3 = call float @llvm.experimental.constrained.fmul.f32(float %0, float %1,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ %4 = call float @llvm.experimental.constrained.fadd.f32(float %3, float %2,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret float %4
+}
+
+; Verify constrained fmul and fadd aren't fused.
+define double @f12(double %0, double %1, double %2) #0 {
+; NOFMA-LABEL: f12:
+; NOFMA: # %bb.0: # %entry
+; NOFMA-NEXT: mulsd %xmm1, %xmm0
+; NOFMA-NEXT: addsd %xmm2, %xmm0
+; NOFMA-NEXT: retq
+;
+; FMA-LABEL: f12:
+; FMA: # %bb.0: # %entry
+; FMA-NEXT: vmulsd %xmm1, %xmm0, %xmm0
+; FMA-NEXT: vaddsd %xmm2, %xmm0, %xmm0
+; FMA-NEXT: retq
+entry:
+ %3 = call double @llvm.experimental.constrained.fmul.f64(double %0, double %1,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ %4 = call double @llvm.experimental.constrained.fadd.f64(double %3, double %2,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret double %4
+}
+
+; Verify that fmuladd(3.5) isn't simplified when the rounding mode is
+; unknown.
+define float @f15() #0 {
+; NOFMA-LABEL: f15:
+; NOFMA: # %bb.0: # %entry
+; NOFMA-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; NOFMA-NEXT: movaps %xmm1, %xmm0
+; NOFMA-NEXT: mulss %xmm1, %xmm0
+; NOFMA-NEXT: addss %xmm1, %xmm0
+; NOFMA-NEXT: retq
+;
+; FMA-LABEL: f15:
+; FMA: # %bb.0: # %entry
+; FMA-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; FMA-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
+; FMA-NEXT: retq
+entry:
+ %result = call float @llvm.experimental.constrained.fmuladd.f32(
+ float 3.5,
+ float 3.5,
+ float 3.5,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret float %result
+}
+
+; Verify that fmuladd(42.1) isn't simplified when the rounding mode is
+; unknown.
+define double @f16() #0 {
+; NOFMA-LABEL: f16:
+; NOFMA: # %bb.0: # %entry
+; NOFMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; NOFMA-NEXT: movapd %xmm1, %xmm0
+; NOFMA-NEXT: mulsd %xmm1, %xmm0
+; NOFMA-NEXT: addsd %xmm1, %xmm0
+; NOFMA-NEXT: retq
+;
+; FMA-LABEL: f16:
+; FMA: # %bb.0: # %entry
+; FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; FMA-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
+; FMA-NEXT: retq
+entry:
+ %result = call double @llvm.experimental.constrained.fmuladd.f64(
+ double 42.1,
+ double 42.1,
+ double 42.1,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret double %result
+}
; Verify that fma(3.5) isn't simplified when the rounding mode is
; unknown.
@@ -65,5 +397,11 @@
attributes #0 = { strictfp }
+declare float @llvm.experimental.constrained.fmul.f32(float, float, metadata, metadata)
+declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata)
+declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata)
+declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata)
declare float @llvm.experimental.constrained.fma.f32(float, float, float, metadata, metadata)
declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata)
+declare float @llvm.experimental.constrained.fmuladd.f32(float, float, float, metadata, metadata)
+declare double @llvm.experimental.constrained.fmuladd.f64(double, double, double, metadata, metadata)
Index: llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
===================================================================
--- llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -535,8 +535,20 @@
[(X86strict_Fmadd node:$src1, node:$src2, node:$src3),
(X86Fmadd node:$src1, node:$src2, node:$src3)]>;
def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFPTernaryOp, [SDNPCommutative]>;
+def X86strict_Fnmadd : SDNode<"X86ISD::STRICT_FNMADD", SDTFPTernaryOp, [SDNPCommutative, SDNPHasChain]>;
+def X86any_Fnmadd : PatFrags<(ops node:$src1, node:$src2, node:$src3),
+ [(X86strict_Fnmadd node:$src1, node:$src2, node:$src3),
+ (X86Fnmadd node:$src1, node:$src2, node:$src3)]>;
def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFPTernaryOp, [SDNPCommutative]>;
+def X86strict_Fmsub : SDNode<"X86ISD::STRICT_FMSUB", SDTFPTernaryOp, [SDNPCommutative, SDNPHasChain]>;
+def X86any_Fmsub : PatFrags<(ops node:$src1, node:$src2, node:$src3),
+ [(X86strict_Fmsub node:$src1, node:$src2, node:$src3),
+ (X86Fmsub node:$src1, node:$src2, node:$src3)]>;
def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFPTernaryOp, [SDNPCommutative]>;
+def X86strict_Fnmsub : SDNode<"X86ISD::STRICT_FNMSUB", SDTFPTernaryOp, [SDNPCommutative, SDNPHasChain]>;
+def X86any_Fnmsub : PatFrags<(ops node:$src1, node:$src2, node:$src3),
+ [(X86strict_Fnmsub node:$src1, node:$src2, node:$src3),
+ (X86Fnmsub node:$src1, node:$src2, node:$src3)]>;
def X86Fmaddsub : SDNode<"X86ISD::FMADDSUB", SDTFPTernaryOp, [SDNPCommutative]>;
def X86Fmsubadd : SDNode<"X86ISD::FMSUBADD", SDTFPTernaryOp, [SDNPCommutative]>;
Index: llvm/lib/Target/X86/X86InstrFMA.td
===================================================================
--- llvm/lib/Target/X86/X86InstrFMA.td
+++ llvm/lib/Target/X86/X86InstrFMA.td
@@ -126,7 +126,7 @@
loadv4f32, loadv8f32, X86any_Fmadd, v4f32, v8f32,
SchedWriteFMA>;
defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", "PS",
- loadv4f32, loadv8f32, X86Fmsub, v4f32, v8f32,
+ loadv4f32, loadv8f32, X86any_Fmsub, v4f32, v8f32,
SchedWriteFMA>;
defm VFMADDSUB : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps", "PS",
loadv4f32, loadv8f32, X86Fmaddsub, v4f32, v8f32,
@@ -141,7 +141,7 @@
loadv2f64, loadv4f64, X86any_Fmadd, v2f64,
v4f64, SchedWriteFMA>, VEX_W;
defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", "PD",
- loadv2f64, loadv4f64, X86Fmsub, v2f64,
+ loadv2f64, loadv4f64, X86any_Fmsub, v2f64,
v4f64, SchedWriteFMA>, VEX_W;
defm VFMADDSUB : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd", "PD",
loadv2f64, loadv4f64, X86Fmaddsub,
@@ -154,15 +154,15 @@
// Fused Negative Multiply-Add
let ExeDomain = SSEPackedSingle in {
defm VFNMADD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps", "PS", loadv4f32,
- loadv8f32, X86Fnmadd, v4f32, v8f32, SchedWriteFMA>;
+ loadv8f32, X86any_Fnmadd, v4f32, v8f32, SchedWriteFMA>;
defm VFNMSUB : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps", "PS", loadv4f32,
- loadv8f32, X86Fnmsub, v4f32, v8f32, SchedWriteFMA>;
+ loadv8f32, X86any_Fnmsub, v4f32, v8f32, SchedWriteFMA>;
}
let ExeDomain = SSEPackedDouble in {
defm VFNMADD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", "PD", loadv2f64,
- loadv4f64, X86Fnmadd, v2f64, v4f64, SchedWriteFMA>, VEX_W;
+ loadv4f64, X86any_Fnmadd, v2f64, v4f64, SchedWriteFMA>, VEX_W;
defm VFNMSUB : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd", "PD", loadv2f64,
- loadv4f64, X86Fnmsub, v2f64, v4f64, SchedWriteFMA>, VEX_W;
+ loadv4f64, X86any_Fnmsub, v2f64, v4f64, SchedWriteFMA>, VEX_W;
}
// All source register operands of FMA opcodes defined in fma3s_rm multiclass
@@ -321,12 +321,12 @@
defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", X86any_Fmadd,
SchedWriteFMA.Scl>, VEX_LIG;
-defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", X86Fmsub,
+defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", X86any_Fmsub,
SchedWriteFMA.Scl>, VEX_LIG;
-defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", X86Fnmadd,
+defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", X86any_Fnmadd,
SchedWriteFMA.Scl>, VEX_LIG;
-defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", X86Fnmsub,
+defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", X86any_Fnmsub,
SchedWriteFMA.Scl>, VEX_LIG;
multiclass scalar_fma_patterns<SDNode Op, string Prefix, string Suffix,
@@ -373,14 +373,14 @@
}
defm : scalar_fma_patterns<X86any_Fmadd, "VFMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
-defm : scalar_fma_patterns<X86Fmsub, "VFMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
-defm : scalar_fma_patterns<X86Fnmadd, "VFNMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
-defm : scalar_fma_patterns<X86Fnmsub, "VFNMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
+defm : scalar_fma_patterns<X86any_Fmsub, "VFMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
+defm : scalar_fma_patterns<X86any_Fnmadd, "VFNMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
+defm : scalar_fma_patterns<X86any_Fnmsub, "VFNMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
defm : scalar_fma_patterns<X86any_Fmadd, "VFMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
-defm : scalar_fma_patterns<X86Fmsub, "VFMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
-defm : scalar_fma_patterns<X86Fnmadd, "VFNMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
-defm : scalar_fma_patterns<X86Fnmsub, "VFNMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
+defm : scalar_fma_patterns<X86any_Fmsub, "VFMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
+defm : scalar_fma_patterns<X86any_Fnmadd, "VFNMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
+defm : scalar_fma_patterns<X86any_Fnmsub, "VFNMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
//===----------------------------------------------------------------------===//
// FMA4 - AMD 4 operand Fused Multiply-Add instructions
@@ -542,26 +542,26 @@
SchedWriteFMA.Scl>,
fma4s_int<0x6A, "vfmaddss", ssmem, v4f32,
SchedWriteFMA.Scl>;
- defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32,
+ defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86any_Fmsub, loadf32,
SchedWriteFMA.Scl>,
fma4s_int<0x6E, "vfmsubss", ssmem, v4f32,
SchedWriteFMA.Scl>;
defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32,
- X86Fnmadd, loadf32, SchedWriteFMA.Scl>,
+ X86any_Fnmadd, loadf32, SchedWriteFMA.Scl>,
fma4s_int<0x7A, "vfnmaddss", ssmem, v4f32,
SchedWriteFMA.Scl>;
defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32,
- X86Fnmsub, loadf32, SchedWriteFMA.Scl>,
+ X86any_Fnmsub, loadf32, SchedWriteFMA.Scl>,
fma4s_int<0x7E, "vfnmsubss", ssmem, v4f32,
SchedWriteFMA.Scl>;
// Packed Instructions
defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86any_Fmadd, v4f32, v8f32,
loadv4f32, loadv8f32, SchedWriteFMA>;
- defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32,
+ defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86any_Fmsub, v4f32, v8f32,
loadv4f32, loadv8f32, SchedWriteFMA>;
- defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", X86Fnmadd, v4f32, v8f32,
+ defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", X86any_Fnmadd, v4f32, v8f32,
loadv4f32, loadv8f32, SchedWriteFMA>;
- defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", X86Fnmsub, v4f32, v8f32,
+ defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", X86any_Fnmsub, v4f32, v8f32,
loadv4f32, loadv8f32, SchedWriteFMA>;
defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", X86Fmaddsub, v4f32, v8f32,
loadv4f32, loadv8f32, SchedWriteFMA>;
@@ -575,26 +575,26 @@
SchedWriteFMA.Scl>,
fma4s_int<0x6B, "vfmaddsd", sdmem, v2f64,
SchedWriteFMA.Scl>;
- defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64,
+ defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86any_Fmsub, loadf64,
SchedWriteFMA.Scl>,
fma4s_int<0x6F, "vfmsubsd", sdmem, v2f64,
SchedWriteFMA.Scl>;
defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64,
- X86Fnmadd, loadf64, SchedWriteFMA.Scl>,
+ X86any_Fnmadd, loadf64, SchedWriteFMA.Scl>,
fma4s_int<0x7B, "vfnmaddsd", sdmem, v2f64,
SchedWriteFMA.Scl>;
defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64,
- X86Fnmsub, loadf64, SchedWriteFMA.Scl>,
+ X86any_Fnmsub, loadf64, SchedWriteFMA.Scl>,
fma4s_int<0x7F, "vfnmsubsd", sdmem, v2f64,
SchedWriteFMA.Scl>;
// Packed Instructions
defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86any_Fmadd, v2f64, v4f64,
loadv2f64, loadv4f64, SchedWriteFMA>;
- defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64,
+ defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86any_Fmsub, v2f64, v4f64,
loadv2f64, loadv4f64, SchedWriteFMA>;
- defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", X86Fnmadd, v2f64, v4f64,
+ defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", X86any_Fnmadd, v2f64, v4f64,
loadv2f64, loadv4f64, SchedWriteFMA>;
- defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", X86Fnmsub, v2f64, v4f64,
+ defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", X86any_Fnmsub, v2f64, v4f64,
loadv2f64, loadv4f64, SchedWriteFMA>;
defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", X86Fmaddsub, v2f64, v4f64,
loadv2f64, loadv4f64, SchedWriteFMA>;
@@ -630,11 +630,11 @@
}
defm : scalar_fma4_patterns<X86any_Fmadd, "VFMADDSS4", v4f32, f32, FR32, loadf32>;
-defm : scalar_fma4_patterns<X86Fmsub, "VFMSUBSS4", v4f32, f32, FR32, loadf32>;
-defm : scalar_fma4_patterns<X86Fnmadd, "VFNMADDSS4", v4f32, f32, FR32, loadf32>;
-defm : scalar_fma4_patterns<X86Fnmsub, "VFNMSUBSS4", v4f32, f32, FR32, loadf32>;
+defm : scalar_fma4_patterns<X86any_Fmsub, "VFMSUBSS4", v4f32, f32, FR32, loadf32>;
+defm : scalar_fma4_patterns<X86any_Fnmadd, "VFNMADDSS4", v4f32, f32, FR32, loadf32>;
+defm : scalar_fma4_patterns<X86any_Fnmsub, "VFNMSUBSS4", v4f32, f32, FR32, loadf32>;
defm : scalar_fma4_patterns<X86any_Fmadd, "VFMADDSD4", v2f64, f64, FR64, loadf64>;
-defm : scalar_fma4_patterns<X86Fmsub, "VFMSUBSD4", v2f64, f64, FR64, loadf64>;
-defm : scalar_fma4_patterns<X86Fnmadd, "VFNMADDSD4", v2f64, f64, FR64, loadf64>;
-defm : scalar_fma4_patterns<X86Fnmsub, "VFNMSUBSD4", v2f64, f64, FR64, loadf64>;
+defm : scalar_fma4_patterns<X86any_Fmsub, "VFMSUBSD4", v2f64, f64, FR64, loadf64>;
+defm : scalar_fma4_patterns<X86any_Fnmadd, "VFNMADDSD4", v2f64, f64, FR64, loadf64>;
+defm : scalar_fma4_patterns<X86any_Fnmsub, "VFNMSUBSD4", v2f64, f64, FR64, loadf64>;
Index: llvm/lib/Target/X86/X86InstrAVX512.td
===================================================================
--- llvm/lib/Target/X86/X86InstrAVX512.td
+++ llvm/lib/Target/X86/X86InstrAVX512.td
@@ -6487,11 +6487,11 @@
}
defm VFMADD213 : avx512_fma3p_213_f<0xA8, "vfmadd213", X86any_Fmadd, X86FmaddRnd>;
-defm VFMSUB213 : avx512_fma3p_213_f<0xAA, "vfmsub213", X86Fmsub, X86FmsubRnd>;
+defm VFMSUB213 : avx512_fma3p_213_f<0xAA, "vfmsub213", X86any_Fmsub, X86FmsubRnd>;
defm VFMADDSUB213 : avx512_fma3p_213_f<0xA6, "vfmaddsub213", X86Fmaddsub, X86FmaddsubRnd>;
defm VFMSUBADD213 : avx512_fma3p_213_f<0xA7, "vfmsubadd213", X86Fmsubadd, X86FmsubaddRnd>;
-defm VFNMADD213 : avx512_fma3p_213_f<0xAC, "vfnmadd213", X86Fnmadd, X86FnmaddRnd>;
-defm VFNMSUB213 : avx512_fma3p_213_f<0xAE, "vfnmsub213", X86Fnmsub, X86FnmsubRnd>;
+defm VFNMADD213 : avx512_fma3p_213_f<0xAC, "vfnmadd213", X86any_Fnmadd, X86FnmaddRnd>;
+defm VFNMSUB213 : avx512_fma3p_213_f<0xAE, "vfnmsub213", X86any_Fnmsub, X86FnmsubRnd>;
multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -6565,11 +6565,11 @@
}
defm VFMADD231 : avx512_fma3p_231_f<0xB8, "vfmadd231", X86any_Fmadd, X86FmaddRnd>;
-defm VFMSUB231 : avx512_fma3p_231_f<0xBA, "vfmsub231", X86Fmsub, X86FmsubRnd>;
+defm VFMSUB231 : avx512_fma3p_231_f<0xBA, "vfmsub231", X86any_Fmsub, X86FmsubRnd>;
defm VFMADDSUB231 : avx512_fma3p_231_f<0xB6, "vfmaddsub231", X86Fmaddsub, X86FmaddsubRnd>;
defm VFMSUBADD231 : avx512_fma3p_231_f<0xB7, "vfmsubadd231", X86Fmsubadd, X86FmsubaddRnd>;
-defm VFNMADD231 : avx512_fma3p_231_f<0xBC, "vfnmadd231", X86Fnmadd, X86FnmaddRnd>;
-defm VFNMSUB231 : avx512_fma3p_231_f<0xBE, "vfnmsub231", X86Fnmsub, X86FnmsubRnd>;
+defm VFNMADD231 : avx512_fma3p_231_f<0xBC, "vfnmadd231", X86any_Fnmadd, X86FnmaddRnd>;
+defm VFNMSUB231 : avx512_fma3p_231_f<0xBE, "vfnmsub231", X86any_Fnmsub, X86FnmsubRnd>;
multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86FoldableSchedWrite sched,
@@ -6645,11 +6645,11 @@
}
defm VFMADD132 : avx512_fma3p_132_f<0x98, "vfmadd132", X86any_Fmadd, X86FmaddRnd>;
-defm VFMSUB132 : avx512_fma3p_132_f<0x9A, "vfmsub132", X86Fmsub, X86FmsubRnd>;
+defm VFMSUB132 : avx512_fma3p_132_f<0x9A, "vfmsub132", X86any_Fmsub, X86FmsubRnd>;
defm VFMADDSUB132 : avx512_fma3p_132_f<0x96, "vfmaddsub132", X86Fmaddsub, X86FmaddsubRnd>;
defm VFMSUBADD132 : avx512_fma3p_132_f<0x97, "vfmsubadd132", X86Fmsubadd, X86FmsubaddRnd>;
-defm VFNMADD132 : avx512_fma3p_132_f<0x9C, "vfnmadd132", X86Fnmadd, X86FnmaddRnd>;
-defm VFNMSUB132 : avx512_fma3p_132_f<0x9E, "vfnmsub132", X86Fnmsub, X86FnmsubRnd>;
+defm VFNMADD132 : avx512_fma3p_132_f<0x9C, "vfnmadd132", X86any_Fnmadd, X86FnmaddRnd>;
+defm VFNMSUB132 : avx512_fma3p_132_f<0x9E, "vfnmsub132", X86any_Fnmsub, X86FnmsubRnd>;
// Scalar FMA
multiclass avx512_fma3s_common<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
@@ -6742,9 +6742,9 @@
}
defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86any_Fmadd, X86FmaddRnd>;
-defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86FmsubRnd>;
-defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd, X86FnmaddRnd>;
-defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, X86FnmsubRnd>;
+defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86any_Fmsub, X86FmsubRnd>;
+defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86any_Fnmadd, X86FnmaddRnd>;
+defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86any_Fnmsub, X86FnmsubRnd>;
multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix,
string Suffix, SDNode Move,
@@ -6950,20 +6950,20 @@
defm : avx512_scalar_fma_patterns<X86any_Fmadd, X86FmaddRnd, "VFMADD", "SS",
X86Movss, v4f32x_info, fp32imm0>;
-defm : avx512_scalar_fma_patterns<X86Fmsub, X86FmsubRnd, "VFMSUB", "SS",
+defm : avx512_scalar_fma_patterns<X86any_Fmsub, X86FmsubRnd, "VFMSUB", "SS",
X86Movss, v4f32x_info, fp32imm0>;
-defm : avx512_scalar_fma_patterns<X86Fnmadd, X86FnmaddRnd, "VFNMADD", "SS",
+defm : avx512_scalar_fma_patterns<X86any_Fnmadd, X86FnmaddRnd, "VFNMADD", "SS",
X86Movss, v4f32x_info, fp32imm0>;
-defm : avx512_scalar_fma_patterns<X86Fnmsub, X86FnmsubRnd, "VFNMSUB", "SS",
+defm : avx512_scalar_fma_patterns<X86any_Fnmsub, X86FnmsubRnd, "VFNMSUB", "SS",
X86Movss, v4f32x_info, fp32imm0>;
defm : avx512_scalar_fma_patterns<X86any_Fmadd, X86FmaddRnd, "VFMADD", "SD",
X86Movsd, v2f64x_info, fp64imm0>;
-defm : avx512_scalar_fma_patterns<X86Fmsub, X86FmsubRnd, "VFMSUB", "SD",
+defm : avx512_scalar_fma_patterns<X86any_Fmsub, X86FmsubRnd, "VFMSUB", "SD",
X86Movsd, v2f64x_info, fp64imm0>;
-defm : avx512_scalar_fma_patterns<X86Fnmadd, X86FnmaddRnd, "VFNMADD", "SD",
+defm : avx512_scalar_fma_patterns<X86any_Fnmadd, X86FnmaddRnd, "VFNMADD", "SD",
X86Movsd, v2f64x_info, fp64imm0>;
-defm : avx512_scalar_fma_patterns<X86Fnmsub, X86FnmsubRnd, "VFNMSUB", "SD",
+defm : avx512_scalar_fma_patterns<X86any_Fnmsub, X86FnmsubRnd, "VFNMSUB", "SD",
X86Movsd, v2f64x_info, fp64imm0>;
//===----------------------------------------------------------------------===//
Index: llvm/lib/Target/X86/X86ISelLowering.h
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.h
+++ llvm/lib/Target/X86/X86ISelLowering.h
@@ -468,9 +468,9 @@
// FMA nodes.
// We use the target independent ISD::FMA for the non-inverted case.
- FNMADD,
- FMSUB,
- FNMSUB,
+ FNMADD, STRICT_FNMADD,
+ FMSUB, STRICT_FMSUB,
+ FNMSUB, STRICT_FNMSUB,
FMADDSUB,
FMSUBADD,
Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2000,6 +2000,7 @@
setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::FNEG);
setTargetDAGCombine(ISD::FMA);
+ setTargetDAGCombine(ISD::STRICT_FMA);
setTargetDAGCombine(ISD::FMINNUM);
setTargetDAGCombine(ISD::FMAXNUM);
setTargetDAGCombine(ISD::SUB);
@@ -29817,8 +29818,11 @@
case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
case X86ISD::FMSUB: return "X86ISD::FMSUB";
+ case X86ISD::STRICT_FMSUB: return "X86ISD::STRICT_FMSUB";
case X86ISD::FNMADD: return "X86ISD::FNMADD";
+ case X86ISD::STRICT_FNMADD: return "X86ISD::STRICT_FNMADD";
case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
+ case X86ISD::STRICT_FNMSUB: return "X86ISD::STRICT_FNMSUB";
case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
@@ -42514,37 +42518,46 @@
if (NegMul) {
switch (Opcode) {
default: llvm_unreachable("Unexpected opcode");
- case ISD::FMA: Opcode = X86ISD::FNMADD; break;
- case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
- case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
- case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
- case X86ISD::FNMADD: Opcode = ISD::FMA; break;
- case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
- case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
- case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
+ case ISD::FMA: Opcode = X86ISD::FNMADD; break;
+ case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
+ case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
+ case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
+ case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
+ case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
+ case X86ISD::FNMADD: Opcode = ISD::FMA; break;
+ case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
+ case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
+ case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
+ case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
+ case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
}
}
if (NegAcc) {
switch (Opcode) {
default: llvm_unreachable("Unexpected opcode");
- case ISD::FMA: Opcode = X86ISD::FMSUB; break;
- case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
- case X86ISD::FMSUB: Opcode = ISD::FMA; break;
- case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
- case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
- case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
- case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
- case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
- case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
- case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
- case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
- case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
+ case ISD::FMA: Opcode = X86ISD::FMSUB; break;
+ case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
+ case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
+ case X86ISD::FMSUB: Opcode = ISD::FMA; break;
+ case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
+ case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
+ case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
+ case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
+ case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
+ case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
+ case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
+ case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
+ case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
+ case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
+ case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
+ case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
}
}
if (NegRes) {
switch (Opcode) {
+    // For accuracy reasons, we never combine fneg and fma under strict FP,
+    // so no strict opcodes appear in this switch.
default: llvm_unreachable("Unexpected opcode");
case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
@@ -43516,6 +43529,7 @@
const X86Subtarget &Subtarget) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
+ bool IsStrict = N->isStrictFPOpcode();
// Let legalize expand this if it isn't a legal type yet.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -43526,9 +43540,9 @@
if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
return SDValue();
- SDValue A = N->getOperand(0);
- SDValue B = N->getOperand(1);
- SDValue C = N->getOperand(2);
+ SDValue A = N->getOperand(IsStrict ? 1 : 0);
+ SDValue B = N->getOperand(IsStrict ? 2 : 1);
+ SDValue C = N->getOperand(IsStrict ? 3 : 2);
auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
@@ -43566,9 +43580,15 @@
unsigned NewOpcode =
negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
- if (N->getNumOperands() == 4)
- return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
- return DAG.getNode(NewOpcode, dl, VT, A, B, C);
+ if (IsStrict) {
+    assert(N->getNumOperands() == 4 && "Strict FMA must have exactly 4 operands");
+ return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
+ {N->getOperand(0), A, B, C});
+ } else {
+ if (N->getNumOperands() == 4)
+ return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
+ return DAG.getNode(NewOpcode, dl, VT, A, B, C);
+ }
}
// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
@@ -46071,12 +46091,16 @@
case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
case X86ISD::FMADD_RND:
case X86ISD::FMSUB:
+ case X86ISD::STRICT_FMSUB:
case X86ISD::FMSUB_RND:
case X86ISD::FNMADD:
+ case X86ISD::STRICT_FNMADD:
case X86ISD::FNMADD_RND:
case X86ISD::FNMSUB:
+ case X86ISD::STRICT_FNMSUB:
case X86ISD::FNMSUB_RND:
- case ISD::FMA: return combineFMA(N, DAG, DCI, Subtarget);
+ case ISD::FMA:
+ case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
case X86ISD::FMADDSUB_RND:
case X86ISD::FMSUBADD_RND:
case X86ISD::FMADDSUB:
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7059,31 +7059,53 @@
}
SDVTList VTs = DAG.getVTList(ValueVTs);
- SDValue Result = DAG.getNode(Opcode, sdl, VTs, Opers);
-
- assert(Result.getNode()->getNumValues() == 2);
-
- // Push node to the appropriate list so that future instructions can be
- // chained up correctly.
- SDValue OutChain = Result.getValue(1);
- switch (FPI.getExceptionBehavior().getValue()) {
- case fp::ExceptionBehavior::ebIgnore:
- // The only reason why ebIgnore nodes still need to be chained is that
- // they might depend on the current rounding mode, and therefore must
- // not be moved across instruction that may change that mode.
- LLVM_FALLTHROUGH;
- case fp::ExceptionBehavior::ebMayTrap:
- // These must not be moved across calls or instructions that may change
- // floating-point exception masks.
- PendingConstrainedFP.push_back(OutChain);
- break;
- case fp::ExceptionBehavior::ebStrict:
- // These must not be moved across calls or instructions that may change
- // floating-point exception masks or read floating-point exception flags.
- // In addition, they cannot be optimized out even if unused.
- PendingConstrainedFPStrict.push_back(OutChain);
- break;
- }
+ SDValue Result;
+
+ auto pushOutChain = [&]() {
+ assert(Result.getNode()->getNumValues() == 2);
+
+ // Push node to the appropriate list so that future instructions can be
+ // chained up correctly.
+ SDValue OutChain = Result.getValue(1);
+ switch (FPI.getExceptionBehavior().getValue()) {
+ case fp::ExceptionBehavior::ebIgnore:
+ // The only reason why ebIgnore nodes still need to be chained is that
+ // they might depend on the current rounding mode, and therefore must
+ // not be moved across instruction that may change that mode.
+ LLVM_FALLTHROUGH;
+ case fp::ExceptionBehavior::ebMayTrap:
+ // These must not be moved across calls or instructions that may change
+ // floating-point exception masks.
+ PendingConstrainedFP.push_back(OutChain);
+ break;
+ case fp::ExceptionBehavior::ebStrict:
+ // These must not be moved across calls or instructions that may change
+ // floating-point exception masks or read floating-point exception flags.
+ // In addition, they cannot be optimized out even if unused.
+ PendingConstrainedFPStrict.push_back(OutChain);
+ break;
+ }
+ };
+
+ if (Opcode == ISD::STRICT_FMULADD) {
+ Opcode = ISD::STRICT_FMA;
+ // Break fmuladd into fmul and fadd.
+ if (TM.Options.AllowFPOpFusion == FPOpFusion::Strict ||
+ !TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(),
+ ValueVTs[0])) {
+ Opers.pop_back();
+ Result = DAG.getNode(ISD::STRICT_FMUL, sdl, VTs, Opers);
+ pushOutChain();
+ Opcode = ISD::STRICT_FADD;
+ Opers.clear();
+ Opers.push_back(Result.getValue(1));
+ Opers.push_back(Result.getValue(0));
+ Opers.push_back(getValue(FPI.getArgOperand(2)));
+ }
+ }
+
+ Result = DAG.getNode(Opcode, sdl, VTs, Opers);
+ pushOutChain();
SDValue FPResult = Result.getValue(0);
setValue(&FPI, FPResult);
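At the IR level, the split performed above is exactly the documented fmuladd
decomposition: the STRICT_FMUL's out-chain (Result.getValue(1)) is threaded
into the STRICT_FADD's operand list, so the two operations stay ordered with
respect to anything else that touches the FP environment. A minimal IR sketch
of the unfused form (metadata shown for the dynamic-rounding/strict-exception
case):

  %mul = call float @llvm.experimental.constrained.fmul.f32(float %a, float %b,
             metadata !"round.dynamic", metadata !"fpexcept.strict") #0
  ; %mul's chain feeds the fadd in the DAG, preserving exception ordering
  %add = call float @llvm.experimental.constrained.fadd.f32(float %mul, float %c,
             metadata !"round.dynamic", metadata !"fpexcept.strict") #0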
Index: llvm/include/llvm/IR/Intrinsics.td
===================================================================
--- llvm/include/llvm/IR/Intrinsics.td
+++ llvm/include/llvm/IR/Intrinsics.td
@@ -626,6 +626,13 @@
llvm_metadata_ty,
llvm_metadata_ty ]>;
+ def int_experimental_constrained_fmuladd : Intrinsic<[ llvm_anyfloat_ty ],
+ [ LLVMMatchType<0>,
+ LLVMMatchType<0>,
+ LLVMMatchType<0>,
+ llvm_metadata_ty,
+ llvm_metadata_ty ]>;
+
def int_experimental_constrained_fptosi : Intrinsic<[ llvm_anyint_ty ],
[ llvm_anyfloat_ty,
llvm_metadata_ty ]>;
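Because the return type is llvm_anyfloat_ty and the three value operands are
LLVMMatchType<0>, this single definition yields one overloaded intrinsic per
floating-point (or vector of floating-point) type. Illustrative
instantiations (the scalar ones match the declarations used in the tests of
this patch; the vector one is only an assumed example of the same overload
scheme):

  declare float  @llvm.experimental.constrained.fmuladd.f32(float, float, float, metadata, metadata)
  declare double @llvm.experimental.constrained.fmuladd.f64(double, double, double, metadata, metadata)
  ; hypothetical vector instantiation, shown to illustrate the overloading
  declare <4 x float> @llvm.experimental.constrained.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>, metadata, metadata)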
Index: llvm/include/llvm/IR/ConstrainedOps.def
===================================================================
--- llvm/include/llvm/IR/ConstrainedOps.def
+++ llvm/include/llvm/IR/ConstrainedOps.def
@@ -81,6 +81,10 @@
FUNCTION(sqrt, 1, 1, experimental_constrained_sqrt, FSQRT)
FUNCTION(trunc, 1, 0, experimental_constrained_trunc, FTRUNC)
+// This is the definition for the fmuladd intrinsic, which is converted into
+// a constrained FMA or a constrained FMUL + FADD pair during lowering.
+FUNCTION(fmuladd, 3, 1, experimental_constrained_fmuladd, FMULADD)
+
#undef INSTRUCTION
#undef FUNCTION
#undef CMP_INSTRUCTION
Index: llvm/include/llvm/CodeGen/ISDOpcodes.h
===================================================================
--- llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -352,6 +352,10 @@
/// comparison operation.
STRICT_FSETCC, STRICT_FSETCCS,
+  /// FMULADD/STRICT_FMULADD - Intermediate nodes that let constrained
+  /// fmuladd be handled the same way as the other constrained intrinsics.
+ FMULADD, STRICT_FMULADD,
+
/// FMA - Perform a * b + c with no intermediate rounding step.
FMA,
Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1286,6 +1286,9 @@
case Intrinsic::fmuladd:
ISDs.push_back(ISD::FMA);
break;
+ case Intrinsic::experimental_constrained_fmuladd:
+ ISDs.push_back(ISD::STRICT_FMA);
+ break;
// FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
case Intrinsic::lifetime_start:
case Intrinsic::lifetime_end:
@@ -1509,6 +1512,13 @@
if (IID == Intrinsic::fmuladd)
return ConcreteTTI->getArithmeticInstrCost(BinaryOperator::FMul, RetTy) +
ConcreteTTI->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy);
+    // FIXME: Is the cost of a constrained intrinsic the same as that of its
+    // non-constrained counterpart?
+ if (IID == Intrinsic::experimental_constrained_fmuladd)
+ return ConcreteTTI->getIntrinsicCost(
+ Intrinsic::experimental_constrained_fmul, RetTy, Tys,
+ nullptr) +
+ ConcreteTTI->getIntrinsicCost(
+ Intrinsic::experimental_constrained_fadd, RetTy, Tys, nullptr);
// Else, assume that we need to scalarize this intrinsic. For math builtins
// this will emit a costly libcall, adding call overhead and spills. Make it
Index: llvm/docs/LangRef.rst
===================================================================
--- llvm/docs/LangRef.rst
+++ llvm/docs/LangRef.rst
@@ -16061,6 +16061,63 @@
performed by '``llvm.experimental.constrained.fcmps``' will raise an
exception if either operand is a NAN (QNAN or SNAN).
+'``llvm.experimental.constrained.fmuladd``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+ declare <type>
+ @llvm.experimental.constrained.fmuladd(<type> <op1>, <type> <op2>,
+ <type> <op3>,
+ metadata <rounding mode>,
+ metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.fmuladd``' intrinsic represents
+multiply-add expressions that can be fused if the code generator determines
+that (a) the target instruction set has support for a fused operation,
+and (b) that the fused operation is more efficient than the equivalent,
+separate pair of mul and add instructions.
+
+Arguments:
+""""""""""
+
+The first three arguments to the '``llvm.experimental.constrained.fmuladd``'
+intrinsic must be floating-point or vector of floating-point values.
+All three arguments must have identical types.
+
+The fourth and fifth arguments specify the rounding mode and exception
+behavior as described above.
+
+Semantics:
+""""""""""
+
+The expression:
+
+::
+
+  %0 = call float @llvm.experimental.constrained.fmuladd.f32(%a, %b, %c,
+                       metadata <rounding mode>, metadata <exception behavior>)
+
+is equivalent to the expression:
+
+::
+
+  %0 = call float @llvm.experimental.constrained.fmul.f32(%a, %b,
+                       metadata <rounding mode>, metadata <exception behavior>)
+  %1 = call float @llvm.experimental.constrained.fadd.f32(%0, %c,
+                       metadata <rounding mode>, metadata <exception behavior>)
+
+except that it is unspecified whether rounding will be performed between the
+multiplication and addition steps. Fusion is not guaranteed, even if the target
+platform supports it.
+If a fused multiply-add is required, the corresponding
+:ref:`llvm.experimental.constrained.fma <int_fma>` intrinsic function should be
+used instead.
+This intrinsic never sets errno, just like the
+'``llvm.experimental.constrained.fma.*``' intrinsics.
+
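For reference, a complete, self-contained use of the new intrinsic looks like
this (a minimal sketch, following the strictfp attribute setup used in the
tests of this patch):

  define float @muladd(float %a, float %b, float %c) #0 {
  entry:
    %r = call float @llvm.experimental.constrained.fmuladd.f32(
                         float %a, float %b, float %c,
                         metadata !"round.dynamic",
                         metadata !"fpexcept.strict") #0
    ret float %r
  }

  declare float @llvm.experimental.constrained.fmuladd.f32(float, float, float, metadata, metadata)

  attributes #0 = { strictfp }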
Constrained libm-equivalent Intrinsics
--------------------------------------
Index: clang/test/CodeGen/constrained-math-builtins.c
===================================================================
--- clang/test/CodeGen/constrained-math-builtins.c
+++ clang/test/CodeGen/constrained-math-builtins.c
@@ -148,3 +148,13 @@
// CHECK: declare x86_fp80 @llvm.experimental.constrained.trunc.f80(x86_fp80, metadata)
};
+#pragma STDC FP_CONTRACT ON
+void bar(float f) {
+ f * f + f;
+ (double)f * f + f;
+ (long double)f * f + f;
+
+// CHECK: declare float @llvm.experimental.constrained.fmuladd.f32(float, float, float, metadata, metadata)
+// CHECK: declare double @llvm.experimental.constrained.fmuladd.f64(double, double, double, metadata, metadata)
+// CHECK: declare x86_fp80 @llvm.experimental.constrained.fmuladd.f80(x86_fp80, x86_fp80, x86_fp80, metadata, metadata)
+};
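Each of the three contracted expressions above is expected to lower to a call
such as the following (a sketch for the float case; the exact rounding and
exception metadata depend on the FP options the test is compiled with, and
only the declarations are CHECKed here):

  %0 = call float @llvm.experimental.constrained.fmuladd.f32(
           float %f, float %f, float %f,
           metadata !"round.dynamic", metadata !"fpexcept.strict")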
Index: clang/lib/CodeGen/CGExprScalar.cpp
===================================================================
--- clang/lib/CodeGen/CGExprScalar.cpp
+++ clang/lib/CodeGen/CGExprScalar.cpp
@@ -3361,7 +3361,7 @@
// the add operand respectively. This allows fmuladd to represent a*b-c, or
// c-a*b. Patterns in LLVM should catch the negated forms and translate them to
// efficient operations.
-static Value* buildFMulAdd(llvm::BinaryOperator *MulOp, Value *Addend,
+static Value* buildFMulAdd(llvm::Instruction *MulOp, Value *Addend,
const CodeGenFunction &CGF, CGBuilderTy &Builder,
bool negMul, bool negAdd) {
assert(!(negMul && negAdd) && "Only one of negMul and negAdd should be set.");
@@ -3373,12 +3373,19 @@
if (negAdd)
Addend = Builder.CreateFNeg(Addend, "neg");
- Value *FMulAdd = Builder.CreateCall(
- CGF.CGM.getIntrinsic(llvm::Intrinsic::fmuladd, Addend->getType()),
- {MulOp0, MulOp1, Addend});
- MulOp->eraseFromParent();
+ Value *FMulAdd = nullptr;
+ if (Builder.getIsFPConstrained())
+ FMulAdd = Builder.CreateCall(
+ CGF.CGM.getIntrinsic(llvm::Intrinsic::experimental_constrained_fmuladd,
+ Addend->getType()),
+ {MulOp0, MulOp1, Addend, MulOp->getOperand(2), MulOp->getOperand(3)});
+ else
+ FMulAdd = Builder.CreateCall(
+ CGF.CGM.getIntrinsic(llvm::Intrinsic::fmuladd, Addend->getType()),
+ {MulOp0, MulOp1, Addend});
+ MulOp->eraseFromParent();
- return FMulAdd;
+ return FMulAdd;
}
// Check whether it would be legal to emit an fmuladd intrinsic call to
@@ -3413,6 +3420,21 @@
return buildFMulAdd(RHSBinOp, op.LHS, CGF, Builder, isSub, false);
}
+ if (Builder.getIsFPConstrained()) {
+ if (auto *LHSBinOp = dyn_cast<llvm::CallBase>(op.LHS)) {
+ if (LHSBinOp->getIntrinsicID() ==
+ llvm::Intrinsic::experimental_constrained_fmul &&
+ LHSBinOp->use_empty())
+ return buildFMulAdd(LHSBinOp, op.RHS, CGF, Builder, false, isSub);
+ }
+ if (auto *RHSBinOp = dyn_cast<llvm::CallBase>(op.RHS)) {
+ if (RHSBinOp->getIntrinsicID() ==
+ llvm::Intrinsic::experimental_constrained_fmul &&
+ RHSBinOp->use_empty())
+ return buildFMulAdd(RHSBinOp, op.LHS, CGF, Builder, isSub, false);
+ }
+ }
+
return nullptr;
}
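As the comment on buildFMulAdd notes, the negMul/negAdd flags let the
subtraction forms reuse the same intrinsic. Under a constrained FP region, a
source expression like 'a * b - c' would therefore be emitted roughly as
(a sketch with illustrative metadata):

  %neg = fneg float %c
  %r = call float @llvm.experimental.constrained.fmuladd.f32(
           float %a, float %b, float %neg,
           metadata !"round.dynamic", metadata !"fpexcept.strict")

which the strict-FMA combines added in this patch can then match to vfmsub*
on X86, as exercised by f3/f4 in the new test file.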