[PATCH] D116015: [PowerPC] Add generic fnmsub intrinsic

Qiu Chaofan via Phabricator via cfe-commits Sun, 19 Dec 2021 20:49:10 -0800

qiucf created this revision.
qiucf added reviewers: rzurob, jsji, nemanjai, shchenz, PowerPC.
Herald added subscribers: kbarton, hiraditya.
qiucf requested review of this revision.
Herald added projects: clang, LLVM.
Herald added subscribers: llvm-commits, cfe-commits.


Currently, Clang has several builtins for the `fnmsub` operation:

- `__builtin_vsx_xvnmsubasp`/`__builtin_vsx_xvnmsubadp` for float/double 
vectors, which are transformed into `-fma(a, b, -c)` in LLVM IR
- `__builtin_ppc_fnmsubs`/`__builtin_ppc_fnmsub` for float/double scalars, 
which generate the corresponding intrinsic in IR

However, for the vector versions of the builtin, the three-operation chain 
(fneg, fma, fneg) may be treated as expensive by some passes (such as 
EarlyCSE). We need some way to keep the fnmsub form until code generation.

The new intrinsic, `llvm.ppc.nmsub.*`, can replace the previous 
`llvm.ppc.fnmsub(s)` intrinsics. However, the VSX FMA mutation pass is not 
enabled yet, and `int_ppc_fnmsub(s)` generates the M-form, so we can replace 
them after the pass is fixed.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D116015

Files:
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/builtins-ppc-fma.c
  clang/test/CodeGen/builtins-ppc-fpconstrained.c
  clang/test/CodeGen/builtins-ppc-vsx.c
  llvm/include/llvm/IR/IntrinsicsPowerPC.td
  llvm/lib/Target/PowerPC/PPCISelLowering.cpp
  llvm/test/CodeGen/PowerPC/builtins-ppc-xlcompat-math.ll

Index: llvm/test/CodeGen/PowerPC/builtins-ppc-xlcompat-math.ll
===================================================================
--- llvm/test/CodeGen/PowerPC/builtins-ppc-xlcompat-math.ll
+++ llvm/test/CodeGen/PowerPC/builtins-ppc-xlcompat-math.ll
@@ -142,6 +142,105 @@
 
 declare float @llvm.ppc.fnmsubs(float, float, float)
 
+define dso_local float @nmsub_f32(float %f, float %f2, float %f3) {
+; CHECK-PWR8-LABEL: nmsub_f32:
+; CHECK-PWR8:       # %bb.0: # %entry
+; CHECK-PWR8-NEXT:    xsnmsubasp 3, 1, 2
+; CHECK-PWR8-NEXT:    fmr 1, 3
+; CHECK-PWR8-NEXT:    blr
+;
+; CHECK-NOVSX-LABEL: nmsub_f32:
+; CHECK-NOVSX:       # %bb.0: # %entry
+; CHECK-NOVSX-NEXT:    fnmsubs 1, 1, 2, 3
+; CHECK-NOVSX-NEXT:    blr
+;
+; CHECK-PWR7-LABEL: nmsub_f32:
+; CHECK-PWR7:       # %bb.0: # %entry
+; CHECK-PWR7-NEXT:    fnmsubs 1, 1, 2, 3
+; CHECK-PWR7-NEXT:    blr
+entry:
+  %0 = tail call float @llvm.ppc.nmsub.f32(float %f, float %f2, float %f3)
+  ret float %0
+}
+
+declare float @llvm.ppc.nmsub.f32(float, float, float)
+
+define dso_local double @nmsub_f64(double %f, double %f2, double %f3) {
+; CHECK-PWR8-LABEL: nmsub_f64:
+; CHECK-PWR8:       # %bb.0: # %entry
+; CHECK-PWR8-NEXT:    xsnmsubadp 3, 1, 2
+; CHECK-PWR8-NEXT:    fmr 1, 3
+; CHECK-PWR8-NEXT:    blr
+;
+; CHECK-NOVSX-LABEL: nmsub_f64:
+; CHECK-NOVSX:       # %bb.0: # %entry
+; CHECK-NOVSX-NEXT:    fnmsub 1, 1, 2, 3
+; CHECK-NOVSX-NEXT:    blr
+;
+; CHECK-PWR7-LABEL: nmsub_f64:
+; CHECK-PWR7:       # %bb.0: # %entry
+; CHECK-PWR7-NEXT:    xsnmsubadp 3, 1, 2
+; CHECK-PWR7-NEXT:    fmr 1, 3
+; CHECK-PWR7-NEXT:    blr
+entry:
+  %0 = tail call double @llvm.ppc.nmsub.f64(double %f, double %f2, double %f3)
+  ret double %0
+}
+
+declare double @llvm.ppc.nmsub.f64(double, double, double)
+
+define dso_local <4 x float> @nmsub_v4f32(<4 x float> %f, <4 x float> %f2, <4 x float> %f3) {
+; CHECK-PWR8-LABEL: nmsub_v4f32:
+; CHECK-PWR8:       # %bb.0: # %entry
+; CHECK-PWR8-NEXT:    xvnmsubasp 36, 34, 35
+; CHECK-PWR8-NEXT:    vmr 2, 4
+; CHECK-PWR8-NEXT:    blr
+;
+; CHECK-NOVSX-LABEL: nmsub_v4f32:
+; CHECK-NOVSX:       # %bb.0: # %entry
+; CHECK-NOVSX-NEXT:    fnmsubs 1, 1, 5, 9
+; CHECK-NOVSX-NEXT:    fnmsubs 2, 2, 6, 10
+; CHECK-NOVSX-NEXT:    fnmsubs 3, 3, 7, 11
+; CHECK-NOVSX-NEXT:    fnmsubs 4, 4, 8, 12
+; CHECK-NOVSX-NEXT:    blr
+;
+; CHECK-PWR7-LABEL: nmsub_v4f32:
+; CHECK-PWR7:       # %bb.0: # %entry
+; CHECK-PWR7-NEXT:    xvnmsubasp 36, 34, 35
+; CHECK-PWR7-NEXT:    vmr 2, 4
+; CHECK-PWR7-NEXT:    blr
+entry:
+  %0 = tail call <4 x float> @llvm.ppc.nmsub.v4f32(<4 x float> %f, <4 x float> %f2, <4 x float> %f3)
+  ret <4 x float> %0
+}
+
+declare <4 x float> @llvm.ppc.nmsub.v4f32(<4 x float>, <4 x float>, <4 x float>)
+
+define dso_local <2 x double> @nmsub_v2f64(<2 x double> %f, <2 x double> %f2, <2 x double> %f3) {
+; CHECK-PWR8-LABEL: nmsub_v2f64:
+; CHECK-PWR8:       # %bb.0: # %entry
+; CHECK-PWR8-NEXT:    xvnmsubadp 36, 34, 35
+; CHECK-PWR8-NEXT:    vmr 2, 4
+; CHECK-PWR8-NEXT:    blr
+;
+; CHECK-NOVSX-LABEL: nmsub_v2f64:
+; CHECK-NOVSX:       # %bb.0: # %entry
+; CHECK-NOVSX-NEXT:    fnmsub 1, 1, 3, 5
+; CHECK-NOVSX-NEXT:    fnmsub 2, 2, 4, 6
+; CHECK-NOVSX-NEXT:    blr
+;
+; CHECK-PWR7-LABEL: nmsub_v2f64:
+; CHECK-PWR7:       # %bb.0: # %entry
+; CHECK-PWR7-NEXT:    xvnmsubadp 36, 34, 35
+; CHECK-PWR7-NEXT:    vmr 2, 4
+; CHECK-PWR7-NEXT:    blr
+entry:
+  %0 = tail call <2 x double> @llvm.ppc.nmsub.v2f64(<2 x double> %f, <2 x double> %f2, <2 x double> %f3)
+  ret <2 x double> %0
+}
+
+declare <2 x double> @llvm.ppc.nmsub.v2f64(<2 x double>, <2 x double>, <2 x double>)
+
 define dso_local double @fre(double %d) {
 ; CHECK-PWR8-LABEL: fre:
 ; CHECK-PWR8:       # %bb.0: # %entry
Index: llvm/lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -603,6 +603,8 @@
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f64, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::ppcf128, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f64, Custom);
 
   // To handle counter-based loop conditions.
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i1, Custom);
@@ -10509,6 +10511,16 @@
              DAG.getTargetConstant(PPC::PRED_EQ, dl, MVT::i32)}),
         0);
   }
+  case Intrinsic::ppc_nmsub: {
+    EVT VT = Op.getOperand(1).getValueType();
+    if (!Subtarget.hasVSX() || (!Subtarget.hasFloat128() && VT == MVT::f128))
+      return DAG.getNode(
+          ISD::FNEG, dl, VT,
+          DAG.getNode(ISD::FMA, dl, VT, Op.getOperand(1), Op.getOperand(2),
+                      DAG.getNode(ISD::FNEG, dl, VT, Op.getOperand(3))));
+    return DAG.getNode(PPCISD::FNMSUB, dl, VT, Op.getOperand(1),
+                       Op.getOperand(2), Op.getOperand(3));
+  }
   case Intrinsic::ppc_convert_f128_to_ppcf128:
   case Intrinsic::ppc_convert_ppcf128_to_f128: {
     RTLIB::Libcall LC = IntrinsicID == Intrinsic::ppc_convert_ppcf128_to_f128
@@ -11154,6 +11166,7 @@
       Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::ppcf128,
                                     N->getOperand(2), N->getOperand(1)));
       break;
+    case Intrinsic::ppc_nmsub:
     case Intrinsic::ppc_convert_f128_to_ppcf128:
       Results.push_back(LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), DAG));
       break;
Index: llvm/include/llvm/IR/IntrinsicsPowerPC.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsPowerPC.td
+++ llvm/include/llvm/IR/IntrinsicsPowerPC.td
@@ -1731,6 +1731,10 @@
         Intrinsic <[llvm_float_ty],
                    [llvm_float_ty, llvm_float_ty, llvm_float_ty],
                    [IntrNoMem]>;
+  def int_ppc_nmsub
+      : Intrinsic<[llvm_anyfloat_ty],
+                  [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
+                  [IntrNoMem]>;
   def int_ppc_fre
       : GCCBuiltin<"__builtin_ppc_fre">,
         Intrinsic <[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
Index: clang/test/CodeGen/builtins-ppc-vsx.c
===================================================================
--- clang/test/CodeGen/builtins-ppc-vsx.c
+++ clang/test/CodeGen/builtins-ppc-vsx.c
@@ -894,20 +894,12 @@
 // CHECK-LE-NEXT: fneg <2 x double> %[[FM]]
 
   res_vf = vec_nmsub(vf, vf, vf);
-// CHECK: fneg <4 x float> %{{[0-9]+}}
-// CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float>
-// CHECK: fneg <4 x float> %{{[0-9]+}}
-// CHECK-LE: fneg <4 x float> %{{[0-9]+}}
-// CHECK-LE-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float>
-// CHECK-LE: fneg <4 x float> %{{[0-9]+}}
+// CHECK: call <4 x float> @llvm.ppc.nmsub.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float>
+// CHECK-LE: call <4 x float> @llvm.ppc.nmsub.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float>
 
   res_vd = vec_nmsub(vd, vd, vd);
-// CHECK: fneg <2 x double> %{{[0-9]+}}
-// CHECK-NEXT: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double>
-// CHECK-NEXT: fneg <2 x double> %[[FM]]
-// CHECK-LE: fneg <2 x double> %{{[0-9]+}}
-// CHECK-LE-NEXT: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double>
-// CHECK-LE-NEXT: fneg <2 x double> %[[FM]]
+// CHECK: [[FM:[0-9]+]] = call <2 x double> @llvm.ppc.nmsub.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double>
+// CHECK-LE: [[FM:[0-9]+]] = call <2 x double> @llvm.ppc.nmsub.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double>
 
   /* vec_nor */
   res_vsll = vec_nor(vsll, vsll);
Index: clang/test/CodeGen/builtins-ppc-fpconstrained.c
===================================================================
--- clang/test/CodeGen/builtins-ppc-fpconstrained.c
+++ clang/test/CodeGen/builtins-ppc-fpconstrained.c
@@ -142,9 +142,7 @@
 
   vf = __builtin_vsx_xvnmsubasp(vf, vf, vf);
   // CHECK-LABEL: try-xvnmsubasp
-  // CHECK-UNCONSTRAINED: [[RESULT0:%[^ ]+]] = fneg <4 x float> %{{.*}}
-  // CHECK-UNCONSTRAINED: [[RESULT1:%[^ ]+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> [[RESULT0]])
-  // CHECK-UNCONSTRAINED: fneg <4 x float> [[RESULT1]]
+  // CHECK-UNCONSTRAINED: call <4 x float> @llvm.ppc.nmsub.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
   // CHECK-CONSTRAINED: [[RESULT0:%[^ ]+]] = fneg <4 x float> %{{.*}}
   // CHECK-CONSTRAINED: [[RESULT1:%[^ ]+]] = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> [[RESULT0]], metadata !"round.tonearest", metadata !"fpexcept.strict")
   // CHECK-CONSTRAINED: fneg <4 x float> [[RESULT1]]
@@ -152,9 +150,7 @@
 
   vd = __builtin_vsx_xvnmsubadp(vd, vd, vd);
   // CHECK-LABEL: try-xvnmsubadp
-  // CHECK-UNCONSTRAINED: [[RESULT0:%[^ ]+]] = fneg <2 x double> %{{.*}}
-  // CHECK-UNCONSTRAINED: [[RESULT1:%[^ ]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> [[RESULT0]])
-  // CHECK-UNCONSTRAINED: fneg <2 x double> [[RESULT1]]
+  // CHECK-UNCONSTRAINED: call <2 x double> @llvm.ppc.nmsub.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
   // CHECK-CONSTRAINED: [[RESULT0:%[^ ]+]] = fneg <2 x double> %{{.*}}
   // CHECK-CONSTRAINED: [[RESULT1:%[^ ]+]] = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> [[RESULT0]], metadata !"round.tonearest", metadata !"fpexcept.strict")
   // CHECK-CONSTRAINED: fneg <2 x double> [[RESULT1]]
Index: clang/test/CodeGen/builtins-ppc-fma.c
===================================================================
--- clang/test/CodeGen/builtins-ppc-fma.c
+++ clang/test/CodeGen/builtins-ppc-fma.c
@@ -32,12 +32,8 @@
   // CHECK: <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> [[RESULT]])
 
   vf = __builtin_vsx_xvnmsubasp(vf, vf, vf);
-  // CHECK: [[RESULT:%[^ ]+]] = fneg <4 x float> %{{.*}}
-  // CHECK: [[RESULT2:%[^ ]+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> [[RESULT]])
-  // CHECK: fneg <4 x float> [[RESULT2]]
+  // CHECK: call <4 x float> @llvm.ppc.nmsub.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
 
   vd = __builtin_vsx_xvnmsubadp(vd, vd, vd);
-  // CHECK: [[RESULT:%[^ ]+]] = fneg <2 x double> %{{.*}}
-  // CHECK: [[RESULT2:%[^ ]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> [[RESULT]])
-  // CHECK: fneg <2 x double> [[RESULT2]]
+  // CHECK: call <2 x double> @llvm.ppc.nmsub.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
 }
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -15875,10 +15875,9 @@
                   F, {X, Y, Builder.CreateFNeg(Z, "neg")}),
               "neg");
         else
-          return Builder.CreateFNeg(
-              Builder.CreateCall(F, {X, Y, Builder.CreateFNeg(Z, "neg")}),
-              "neg");
-    }
+          return Builder.CreateCall(
+              CGM.getIntrinsic(Intrinsic::ppc_nmsub, ResultType), {X, Y, Z});
+      }
     llvm_unreachable("Unknown FMA operation");
     return nullptr; // Suppress no-return warning
   }

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D116015: [PowerPC] Add generic fnmsub intrinsic

Reply via email to