[PATCH] D99675: [llvm][clang] Create new intrinsic llvm.arith.fence to control FP optimization at expression level

Melanie Blower via Phabricator via cfe-commits Wed, 26 May 2021 11:58:25 -0700

mibintc updated this revision to Diff 348046.
mibintc retitled this revision from "RFC [llvm][clang] Create new intrinsic 
llvm.arith.fence to control FP optimization at expression level" to 
"[llvm][clang] Create new intrinsic llvm.arith.fence to control FP optimization 
at expression level".
mibintc edited the summary of this revision.
mibintc added a comment.


Rebased to ToT. It fixes the previous illegal type lowering problems. It also 
updates the tests to show the functionality in a better way as well as fixes a 
newly found problem.

Ready for your code review and +1

We think this patch provides basic functionality for the intrinsic, and 
enhancements can be added in future patches.

Thanks!


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D99675/new/

https://reviews.llvm.org/D99675

Files:
  llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
  llvm/include/llvm/CodeGen/BasicTTIImpl.h
  llvm/include/llvm/CodeGen/ISDOpcodes.h
  llvm/include/llvm/CodeGen/SelectionDAGISel.h
  llvm/include/llvm/IR/IRBuilder.h
  llvm/include/llvm/IR/Intrinsics.td
  llvm/include/llvm/Support/TargetOpcodes.def
  llvm/include/llvm/Target/Target.td
  llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
  llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
  llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
  llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
  llvm/test/CodeGen/X86/arithmetic_fence.ll

Index: llvm/test/CodeGen/X86/arithmetic_fence.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/X86/arithmetic_fence.ll
@@ -0,0 +1,161 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+fma | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+fma | FileCheck %s --check-prefix=X64
+
+define float @f1(float %a, float %b, float %c) {
+; X86-LABEL: f1:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT:    vfmadd213ss {{.*#+}} xmm1 = (xmm0 * xmm1) + mem
+; X86-NEXT:    vmovss %xmm1, (%esp)
+; X86-NEXT:    flds (%esp)
+; X86-NEXT:    popl %eax
+; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    retl
+;
+; X64-LABEL: f1:
+; X64:       # %bb.0:
+; X64-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
+; X64-NEXT:    retq
+  %mul = fmul fast float %b, %a
+  %add = fadd fast float %mul, %c
+  ret float %add
+}
+
+define float @f2(float %a, float %b, float %c) {
+; X86-LABEL: f2:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    vmulss {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X86-NEXT:    #ARITH_FENCE
+; X86-NEXT:    vaddss {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X86-NEXT:    vmovss %xmm0, (%esp)
+; X86-NEXT:    flds (%esp)
+; X86-NEXT:    popl %eax
+; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    retl
+;
+; X64-LABEL: f2:
+; X64:       # %bb.0:
+; X64-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; X64-NEXT:    #ARITH_FENCE
+; X64-NEXT:    vaddss %xmm2, %xmm0, %xmm0
+; X64-NEXT:    retq
+  %mul = fmul fast float %b, %a
+  %tmp = call float @llvm.arithmetic.fence.f32(float %mul)
+  %add = fadd fast float %tmp, %c
+  ret float %add
+}
+
+define double @f3(double %a) {
+; X86-LABEL: f3:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %ebp, -8
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    .cfi_def_cfa_register %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    vmulsd {{\.LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT:    vmovsd %xmm0, (%esp)
+; X86-NEXT:    fldl (%esp)
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    .cfi_def_cfa %esp, 4
+; X86-NEXT:    retl
+;
+; X64-LABEL: f3:
+; X64:       # %bb.0:
+; X64-NEXT:    vmulsd {{.*}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    retq
+  %1 = fadd fast double %a, %a
+  %2 = fadd fast double %a, %a
+  %3 = fadd fast double %1, %2
+  ret double %3
+}
+
+define double @f4(double %a) {
+; X86-LABEL: f4:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %ebp, -8
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    .cfi_def_cfa_register %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    vaddsd %xmm0, %xmm0, %xmm0
+; X86-NEXT:    vmovapd %xmm0, %xmm1
+; X86-NEXT:    #ARITH_FENCE
+; X86-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vmovsd %xmm0, (%esp)
+; X86-NEXT:    fldl (%esp)
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    .cfi_def_cfa %esp, 4
+; X86-NEXT:    retl
+;
+; X64-LABEL: f4:
+; X64:       # %bb.0:
+; X64-NEXT:    vaddsd %xmm0, %xmm0, %xmm0
+; X64-NEXT:    vmovapd %xmm0, %xmm1
+; X64-NEXT:    #ARITH_FENCE
+; X64-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
+; X64-NEXT:    retq
+  %1 = fadd fast double %a, %a
+  %t = call double @llvm.arithmetic.fence.f64(double %1)
+  %2 = fadd fast double %a, %a
+  %3 = fadd fast double %t, %2
+  ret double %3
+}
+
+define <2 x float> @f5(<2 x float> %a) {
+; X86-LABEL: f5:
+; X86:       # %bb.0:
+; X86-NEXT:    vmulps {{\.LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: f5:
+; X64:       # %bb.0:
+; X64-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    retq
+  %1 = fadd fast <2 x float> %a, %a
+  %2 = fadd fast <2 x float> %a, %a
+  %3 = fadd fast <2 x float> %1, %2
+  ret <2 x float> %3
+}
+
+define <2 x float> @f6(<2 x float> %a) {
+; X86-LABEL: f6:
+; X86:       # %bb.0:
+; X86-NEXT:    vaddps %xmm0, %xmm0, %xmm0
+; X86-NEXT:    vmovaps %xmm0, %xmm1
+; X86-NEXT:    #ARITH_FENCE
+; X86-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: f6:
+; X64:       # %bb.0:
+; X64-NEXT:    vaddps %xmm0, %xmm0, %xmm0
+; X64-NEXT:    vmovaps %xmm0, %xmm1
+; X64-NEXT:    #ARITH_FENCE
+; X64-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; X64-NEXT:    retq
+  %1 = fadd fast <2 x float> %a, %a
+  %t = call <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float> %1)
+  %2 = fadd fast <2 x float> %a, %a
+  %3 = fadd fast <2 x float> %t, %2
+  ret <2 x float> %3
+}
+
+declare float @llvm.arithmetic.fence.f32(float)
+declare double @llvm.arithmetic.fence.f64(double)
+declare <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float>)
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -2325,6 +2325,11 @@
                        N->getOperand(0));
 }
 
+void SelectionDAGISel::Select_ARITH_FENCE(SDNode *N) {
+  CurDAG->SelectNodeTo(N, TargetOpcode::ARITH_FENCE, N->getValueType(0),
+                       N->getOperand(0));
+}
+
 /// GetVBR - decode a vbr encoding whose top bit is set.
 LLVM_ATTRIBUTE_ALWAYS_INLINE static uint64_t
 GetVBR(uint64_t Val, const unsigned char *MatcherTable, unsigned &Idx) {
@@ -2876,6 +2881,9 @@
   case ISD::FREEZE:
     Select_FREEZE(NodeToMatch);
     return;
+  case ISD::ARITH_FENCE:
+    Select_ARITH_FENCE(NodeToMatch);
+    return;
   }
 
   assert(!NodeToMatch->isMachineOpcode() && "Node already selected!");
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6292,6 +6292,15 @@
                              getValue(I.getArgOperand(0)),
                              getValue(I.getArgOperand(1)), Flags));
     return;
+  case Intrinsic::arithmetic_fence: {
+    auto DL = getCurSDLoc();
+
+    SDValue Val = getValue(I.getArgOperand(0));
+    EVT ResultVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+
+    setValue(&I, DAG.getNode(ISD::ARITH_FENCE, DL, ResultVT, Val));
+    return;
+  }
   case Intrinsic::fma:
     setValue(&I, DAG.getNode(
                      ISD::FMA, sdl, getValue(I.getArgOperand(0)).getValueType(),
Index: llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -3146,6 +3146,7 @@
   case ISD::CTTZ_ZERO_UNDEF:
   case ISD::FNEG:
   case ISD::FREEZE:
+  case ISD::ARITH_FENCE:
   case ISD::FCANONICALIZE:
     Res = WidenVecRes_Unary(N);
     break;
Index: llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
===================================================================
--- llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -1328,6 +1328,9 @@
       case TargetOpcode::PSEUDO_PROBE:
         emitPseudoProbe(MI);
         break;
+      case TargetOpcode::ARITH_FENCE:
+        OutStreamer->emitRawComment("ARITH_FENCE");
+        break;
       default:
         emitInstruction(&MI);
         if (CanDoExtraAnalysis) {
Index: llvm/include/llvm/Target/Target.td
===================================================================
--- llvm/include/llvm/Target/Target.td
+++ llvm/include/llvm/Target/Target.td
@@ -1170,6 +1170,13 @@
   let AsmString = "PSEUDO_PROBE";
   let hasSideEffects = 1;
 }
+def ARITH_FENCE : StandardPseudoInstruction {
+  let OutOperandList = (outs unknown:$dst);
+  let InOperandList = (ins unknown:$src);
+  let AsmString = "";
+  let hasSideEffects = false;
+  let Constraints = "$src = $dst";
+}
 
 def STACKMAP : StandardPseudoInstruction {
   let OutOperandList = (outs);
Index: llvm/include/llvm/Support/TargetOpcodes.def
===================================================================
--- llvm/include/llvm/Support/TargetOpcodes.def
+++ llvm/include/llvm/Support/TargetOpcodes.def
@@ -117,6 +117,9 @@
 /// Pseudo probe
 HANDLE_TARGET_OPCODE(PSEUDO_PROBE)
 
+/// Arithmetic fence.
+HANDLE_TARGET_OPCODE(ARITH_FENCE)
+
 /// A Stackmap instruction captures the location of live variables at its
 /// position in the instruction stream. It is followed by a shadow of bytes
 /// that must lie within the function and not contain another stackmap.
Index: llvm/include/llvm/IR/Intrinsics.td
===================================================================
--- llvm/include/llvm/IR/Intrinsics.td
+++ llvm/include/llvm/IR/Intrinsics.td
@@ -1327,6 +1327,10 @@
 def int_pseudoprobe : Intrinsic<[], [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i64_ty],
                                     [IntrInaccessibleMemOnly, IntrWillReturn]>;
 
+// Arithmetic fence intrinsic.
+def int_arithmetic_fence : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+
+ // Intrinsics to support half precision floating point format
 // Intrinsics to support half precision floating point format
 let IntrProperties = [IntrNoMem, IntrWillReturn] in {
 def int_convert_to_fp16   : DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_anyfloat_ty]>;
Index: llvm/include/llvm/IR/IRBuilder.h
===================================================================
--- llvm/include/llvm/IR/IRBuilder.h
+++ llvm/include/llvm/IR/IRBuilder.h
@@ -897,6 +897,13 @@
     return CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS, nullptr, Name);
   }
 
+  /// Create a call to the arithmetic_fence intrinsic.
+  CallInst *CreateArithmeticFence(Value *Val, Type *DstType,
+                                  const Twine &Name = "") {
+    return CreateIntrinsic(Intrinsic::arithmetic_fence, {DstType}, {Val}, nullptr,
+                           Name);
+  }
+
   /// Create a call to the experimental.vector.extract intrinsic.
   CallInst *CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx,
                                 const Twine &Name = "") {
Index: llvm/include/llvm/CodeGen/SelectionDAGISel.h
===================================================================
--- llvm/include/llvm/CodeGen/SelectionDAGISel.h
+++ llvm/include/llvm/CodeGen/SelectionDAGISel.h
@@ -318,6 +318,7 @@
   void CannotYetSelect(SDNode *N);
 
   void Select_FREEZE(SDNode *N);
+  void Select_ARITH_FENCE(SDNode *N);
 
 private:
   void DoInstructionSelection();
Index: llvm/include/llvm/CodeGen/ISDOpcodes.h
===================================================================
--- llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -1090,6 +1090,10 @@
   /// specifier.
   PREFETCH,
 
+  /// ARITH_FENCE - This corresponds to a arithmetic fence intrinsic. Both its
+  /// operand and output are the same floating type.
+  ARITH_FENCE,
+
   /// OUTCHAIN = ATOMIC_FENCE(INCHAIN, ordering, scope)
   /// This corresponds to the fence instruction. It takes an input chain, and
   /// two integer constants: an AtomicOrdering and a SynchronizationScope.
Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1605,6 +1605,7 @@
     case Intrinsic::lifetime_end:
     case Intrinsic::sideeffect:
     case Intrinsic::pseudoprobe:
+    case Intrinsic::arithmetic_fence:
       return 0;
     case Intrinsic::masked_store: {
       Type *Ty = Tys[0];
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -576,6 +576,7 @@
     case Intrinsic::assume:
     case Intrinsic::sideeffect:
     case Intrinsic::pseudoprobe:
+    case Intrinsic::arithmetic_fence:
     case Intrinsic::dbg_declare:
     case Intrinsic::dbg_value:
     case Intrinsic::dbg_label:

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D99675: [llvm][clang] Create new intrinsic llvm.arith.fence to control FP optimization at expression level

Reply via email to