https://github.com/kumarak updated https://github.com/llvm/llvm-project/pull/180883
>From e485e21f06a9a0d7650e5bdc093b423b802d39b1 Mon Sep 17 00:00:00 2001 From: kumarak <[email protected]> Date: Tue, 10 Feb 2026 23:28:50 +0000 Subject: [PATCH] [CodeGen] Preserve constant-time semantics in nested CTSELECT DAG combines - Fix DAG combine to preserve constant-time properties in nested CTSELECT patterns - Improve legalization handling for CTSELECT with AND/OR merging optimizations - Remove redundant tests from X86/RISCV and add new tests for vector types - Maintain all critical code path coverage while reducing test maintenance burden --- llvm/include/llvm/CodeGen/SelectionDAGNodes.h | 2 +- llvm/include/llvm/CodeGen/TargetLowering.h | 2 +- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 108 +- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 56 +- .../SelectionDAG/LegalizeFloatTypes.cpp | 18 +- .../SelectionDAG/LegalizeIntegerTypes.cpp | 2 +- llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 2 + .../SelectionDAG/LegalizeTypesGeneric.cpp | 14 +- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 1 - llvm/test/CodeGen/RISCV/ctselect-fallback.ll | 698 +++++++-- llvm/test/CodeGen/X86/ctselect.ll | 1268 ++++++++++++----- 11 files changed, 1571 insertions(+), 600 deletions(-) diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h index fdb76a93bc5bb..aa72e81b2ab54 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -435,7 +435,6 @@ struct SDNodeFlags { NonNeg | NoNaNs | NoInfs | SameSign | InBounds, FastMathFlags = NoNaNs | NoInfs | NoSignedZeros | AllowReciprocal | AllowContract | ApproximateFuncs | AllowReassociation, - }; /// Default constructor turns off all optimization flags. @@ -487,6 +486,7 @@ struct SDNodeFlags { bool hasNoFPExcept() const { return Flags & NoFPExcept; } bool hasUnpredictable() const { return Flags & Unpredictable; } bool hasInBounds() const { return Flags & InBounds; } + bool operator==(const SDNodeFlags &Other) const { return Flags == Other.Flags; } diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 084e08e76bd2e..724a69bd26861 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -245,7 +245,7 @@ class LLVM_ABI TargetLoweringBase { ScalarValSelect, // The target supports scalar selects (ex: cmov). ScalarCondVectorVal, // The target supports selects with a scalar condition // and vector values (ex: cmov). - VectorMaskSelect, // The target supports vector selects with a vector + VectorMaskSelect // The target supports vector selects with a vector // mask (ex: x86 blends). }; diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 646bc5e78c051..620a79727278b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -6050,7 +6050,6 @@ static SDValue isSaturatingMinMax(SDValue N0, SDValue N1, SDValue N2, N0CC = cast<CondCodeSDNode>(N0.getOperand(4))->get(); break; case ISD::SELECT: - case ISD::CTSELECT: case ISD::VSELECT: if (N0.getOperand(0).getOpcode() != ISD::SETCC) return SDValue(); @@ -12219,8 +12218,7 @@ template <class MatchContextClass> static SDValue foldBoolSelectToLogic(SDNode *N, const SDLoc &DL, SelectionDAG &DAG) { assert((N->getOpcode() == ISD::SELECT || N->getOpcode() == ISD::VSELECT || - N->getOpcode() == ISD::VP_SELECT || - N->getOpcode() == ISD::CTSELECT) && + N->getOpcode() == ISD::VP_SELECT) && "Expected a (v)(vp.)(ct) select"); SDValue Cond = N->getOperand(0); SDValue T = N->getOperand(1), F = N->getOperand(2); @@ -12583,6 +12581,12 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { return SDValue(); } +// Keep CTSELECT combines deliberately conservative to preserve constant-time +// intent across generic DAG combines. We only accept: +// - canonicalization of negated conditions (flip true/false operands), and +// - i1 CTSELECT nesting merges via AND/OR that keep the result as CTSELECT. +// Broader rewrites should be done in target-specific lowering when stronger +// guarantees about legality and constant-time preservation are available. SDValue DAGCombiner::visitConstantTimeSelect(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -12592,10 +12596,10 @@ SDValue DAGCombiner::visitConstantTimeSelect(SDNode *N) { SDLoc DL(N); SDNodeFlags Flags = N->getFlags(); - if (SDValue V = foldBoolSelectToLogic<EmptyMatchContext>(N, DL, DAG)) - return V; - // ctselect (not Cond), N1, N2 -> ctselect Cond, N2, N1 + // This is a CT-safe canonicalization: flip negated condition by swapping + // arms. extractBooleanFlip only matches boolean xor-with-1, so this preserves + // dataflow semantics and does not introduce data-dependent control flow. if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false)) { SDValue SelectOp = DAG.getNode(ISD::CTSELECT, DL, VT, F, N2, N1); SelectOp->setFlags(Flags); @@ -12603,82 +12607,46 @@ SDValue DAGCombiner::visitConstantTimeSelect(SDNode *N) { } if (VT0 == MVT::i1) { - // The code in this block deals with the following 2 equivalences: - // select(C0|C1, x, y) <=> select(C0, x, select(C1, x, y)) - // select(C0&C1, x, y) <=> select(C0, select(C1, x, y), y) - // The target can specify its preferred form with the - // shouldNormalizeToSelectSequence() callback. However we always transform - // to the right anyway if we find the inner select exists in the DAG anyway - // and we always transform to the left side if we know that we can further - // optimize the combination of the conditions. - bool normalizeToSequence = - TLI.shouldNormalizeToSelectSequence(*DAG.getContext(), VT); - // ctselect (and Cond0, Cond1), X, Y - // -> ctselect Cond0, (ctselect Cond1, X, Y), Y - if (N0->getOpcode() == ISD::AND && N0->hasOneUse()) { - SDValue Cond0 = N0->getOperand(0); - SDValue Cond1 = N0->getOperand(1); - SDValue InnerSelect = DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), - Cond1, N1, N2, Flags); - if (normalizeToSequence || !InnerSelect.use_empty()) - return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), Cond0, - InnerSelect, N2, Flags); - // Cleanup on failure. - if (InnerSelect.use_empty()) - recursivelyDeleteUnusedNodes(InnerSelect.getNode()); - } - // ctselect (or Cond0, Cond1), X, Y -> ctselect Cond0, X, (ctselect Cond1, - // X, Y) - if (N0->getOpcode() == ISD::OR && N0->hasOneUse()) { - SDValue Cond0 = N0->getOperand(0); - SDValue Cond1 = N0->getOperand(1); - SDValue InnerSelect = DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), - Cond1, N1, N2, Flags); - if (normalizeToSequence || !InnerSelect.use_empty()) - return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), Cond0, N1, - InnerSelect, Flags); - // Cleanup on failure. - if (InnerSelect.use_empty()) - recursivelyDeleteUnusedNodes(InnerSelect.getNode()); - } - - // ctselect Cond0, (ctselect Cond1, X, Y), Y -> ctselect (and Cond0, Cond1), - // X, Y + // Nested CTSELECT merging optimizations for i1 conditions. + // These are CT-safe because: + // 1. AND/OR are bitwise operations that execute in constant time + // 2. The optimization combines two sequential CTSELECTs into one, + // reducing + // the total number of constant-time operations without changing + // semantics + // 3. No data-dependent branches or memory accesses are introduced + // + // Pattern 1: ctselect C0, (ctselect C1, X, Y), Y -> ctselect (C0 & C1), X, + // Y + // Semantic equivalence: If C0 is true, evaluate inner select (C1 ? X : + // Y). If C0 is false, choose Y. This is equivalent to (C0 && C1) ? X : Y. if (N1->getOpcode() == ISD::CTSELECT && N1->hasOneUse()) { SDValue N1_0 = N1->getOperand(0); SDValue N1_1 = N1->getOperand(1); SDValue N1_2 = N1->getOperand(2); if (N1_2 == N2 && N0.getValueType() == N1_0.getValueType()) { - // Create the actual and node if we can generate good code for it. - if (!normalizeToSequence) { - SDValue And = DAG.getNode(ISD::AND, DL, N0.getValueType(), N0, N1_0); - return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), And, N1_1, - N2, Flags); - } - // Otherwise see if we can optimize the "and" to a better pattern. - if (SDValue Combined = visitANDLike(N0, N1_0, N)) { - return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), Combined, - N1_1, N2, Flags); - } + SDValue And = DAG.getNode(ISD::AND, DL, N0.getValueType(), N0, N1_0); + SDValue SelectOp = + DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), And, N1_1, N2); + SelectOp->setFlags(Flags); + return SelectOp; } } - // ctselect Cond0, X, (ctselect Cond1, X, Y) -> ctselect (or Cond0, Cond1), - // X, Y + + // Pattern 2: ctselect C0, X, (ctselect C1, X, Y) -> ctselect (C0 | C1), X, + // Y + // Semantic equivalence: If C0 is true, choose X. If C0 is false, evaluate + // inner select (C1 ? X : Y). This is equivalent to (C0 || C1) ? X : Y. if (N2->getOpcode() == ISD::CTSELECT && N2->hasOneUse()) { SDValue N2_0 = N2->getOperand(0); SDValue N2_1 = N2->getOperand(1); SDValue N2_2 = N2->getOperand(2); if (N2_1 == N1 && N0.getValueType() == N2_0.getValueType()) { - // Create the actual or node if we can generate good code for it. - if (!normalizeToSequence) { - SDValue Or = DAG.getNode(ISD::OR, DL, N0.getValueType(), N0, N2_0); - return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), Or, N1, N2_2, - Flags); - } - // Otherwise see if we can optimize to a better pattern. - if (SDValue Combined = visitORLike(N0, N2_0, DL)) - return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), Combined, N1, - N2_2, Flags); + SDValue Or = DAG.getNode(ISD::OR, DL, N0.getValueType(), N0, N2_0); + SDValue SelectOp = + DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), Or, N1, N2_2); + SelectOp->setFlags(Flags); + return SelectOp; } } } diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index ae5a47fa74844..f9c0456651c07 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -4249,19 +4249,51 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { Tmp3 = Node->getOperand(2); EVT VT = Tmp2.getValueType(); if (VT.isVector()) { - SmallVector<SDValue> Elements; - unsigned NumElements = VT.getVectorNumElements(); - EVT ScalarVT = VT.getScalarType(); - for (unsigned Idx = 0; Idx < NumElements; ++Idx) { - SDValue IdxVal = DAG.getConstant(Idx, dl, MVT::i64); - SDValue TVal = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Tmp2, IdxVal); - SDValue FVal = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Tmp3, IdxVal); - Elements.push_back( - DAG.getCTSelect(dl, ScalarVT, Tmp1, TVal, FVal, Node->getFlags())); + // Constant-time vector blending using pattern F ^ ((T ^ F) & Mask) + // where Mask = broadcast(i1 ? -1 : 0) to match vector element width. + // + // This formulation uses only XOR and AND operations, avoiding branches + // that would leak timing information. It's equivalent to: + // Mask==0xFF: F ^ ((T ^ F) & 0xFF) = F ^ (T ^ F) = T + // Mask==0x00: F ^ ((T ^ F) & 0x00) = F ^ 0 = F + + EVT IntVT = VT; + SDValue T = Tmp2; // True value + SDValue F = Tmp3; // False value + + // Step 1: Handle floating-point vectors by bitcasting to integer + if (VT.isFloatingPoint()) { + IntVT = EVT::getVectorVT( + *DAG.getContext(), + EVT::getIntegerVT(*DAG.getContext(), VT.getScalarSizeInBits()), + VT.getVectorElementCount()); + T = DAG.getNode(ISD::BITCAST, dl, IntVT, T); + F = DAG.getNode(ISD::BITCAST, dl, IntVT, F); } - Tmp1 = DAG.getBuildVector(VT, dl, Elements); + + // Step 2: Broadcast the i1 condition to a vector of i1s + // Creates [cond, cond, cond, ...] with i1 elements + EVT VecI1Ty = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + VT.getVectorNumElements()); + SDValue VecCond = DAG.getSplatBuildVector(VecI1Ty, dl, Tmp1); + + // Step 3: Sign-extend i1 vector to get all-bits mask + // true (i1=1) -> 0xFFFFFFFF..., false (i1=0) -> 0x00000000 + // Sign extension is constant-time: pure arithmetic, no branches + SDValue Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, IntVT, VecCond); + + // Step 4: Compute constant-time blend: F ^ ((T ^ F) & Mask) + // All operations (XOR, AND) execute in constant time + SDValue TXorF = DAG.getNode(ISD::XOR, dl, IntVT, T, F); + SDValue MaskedDiff = DAG.getNode(ISD::AND, dl, IntVT, TXorF, Mask); + Tmp1 = DAG.getNode(ISD::XOR, dl, IntVT, F, MaskedDiff); + + // Step 5: Bitcast back to original floating-point type if needed + if (VT.isFloatingPoint()) { + Tmp1 = DAG.getNode(ISD::BITCAST, dl, VT, Tmp1); + } + + Tmp1->setFlags(Node->getFlags()); } else if (VT.isFloatingPoint()) { EVT IntegerVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); Tmp2 = DAG.getBitcast(IntegerVT, Tmp2); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index 6be66663e5c74..1f2d50e8af8d5 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -1572,7 +1572,7 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) { case ISD::POISON: case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break; case ISD::SELECT: SplitRes_Select(N, Lo, Hi); break; - case ISD::CTSELECT: SplitRes_Select(N, Lo, Hi); break; + case ISD::CTSELECT: SplitRes_CTSELECT(N, Lo, Hi); break; case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break; case ISD::MERGE_VALUES: ExpandRes_MERGE_VALUES(N, ResNo, Lo, Hi); break; @@ -2930,7 +2930,7 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) { break; case ISD::SELECT: R = PromoteFloatRes_SELECT(N); break; case ISD::CTSELECT: - R = PromoteFloatRes_SELECT(N); + R = PromoteFloatRes_CTSELECT(N); break; case ISD::SELECT_CC: R = PromoteFloatRes_SELECT_CC(N); break; @@ -3238,6 +3238,11 @@ SDValue DAGTypeLegalizer::PromoteFloatRes_SELECT(SDNode *N) { N->getOperand(0), TrueVal, FalseVal); } +SDValue DAGTypeLegalizer::PromoteFloatRes_CTSELECT(SDNode *N) { + // Keep CTSELECT behavior aligned with SELECT promotion logic. + return PromoteFloatRes_SELECT(N); +} + // Construct a new SELECT_CC node with the promoted true- and false- values. // The operands used for comparison are promoted by PromoteFloatOp_SELECT_CC. SDValue DAGTypeLegalizer::PromoteFloatRes_SELECT_CC(SDNode *N) { @@ -3419,7 +3424,7 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) { break; case ISD::SELECT: R = SoftPromoteHalfRes_SELECT(N); break; case ISD::CTSELECT: - R = SoftPromoteHalfRes_SELECT(N); + R = SoftPromoteHalfRes_CTSELECT(N); break; case ISD::SELECT_CC: R = SoftPromoteHalfRes_SELECT_CC(N); break; case ISD::STRICT_SINT_TO_FP: @@ -3665,6 +3670,13 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfRes_SELECT(SDNode *N) { Op2); } +SDValue DAGTypeLegalizer::SoftPromoteHalfRes_CTSELECT(SDNode *N) { + SDValue Op1 = GetSoftPromotedHalf(N->getOperand(1)); + SDValue Op2 = GetSoftPromotedHalf(N->getOperand(2)); + return DAG.getCTSelect(SDLoc(N), Op1.getValueType(), N->getOperand(0), Op1, + Op2); +} + SDValue DAGTypeLegalizer::SoftPromoteHalfRes_SELECT_CC(SDNode *N) { SDValue Op2 = GetSoftPromotedHalf(N->getOperand(2)); SDValue Op3 = GetSoftPromotedHalf(N->getOperand(3)); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index ac20f8a009600..2318ed7834dc0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -3006,7 +3006,7 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) { case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break; case ISD::SELECT: SplitRes_Select(N, Lo, Hi); break; case ISD::CTSELECT: - SplitRes_Select(N, Lo, Hi); + SplitRes_CTSELECT(N, Lo, Hi); break; case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break; case ISD::POISON: diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 62069b4fb03a3..33784418db499 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -784,6 +784,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteFloatRes_LOAD(SDNode *N); SDValue PromoteFloatRes_ATOMIC_LOAD(SDNode *N); SDValue PromoteFloatRes_SELECT(SDNode *N); + SDValue PromoteFloatRes_CTSELECT(SDNode *N); SDValue PromoteFloatRes_SELECT_CC(SDNode *N); SDValue PromoteFloatRes_UnaryOp(SDNode *N); SDValue PromoteFloatRes_AssertNoFPClass(SDNode *N); @@ -834,6 +835,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue SoftPromoteHalfRes_LOAD(SDNode *N); SDValue SoftPromoteHalfRes_ATOMIC_LOAD(SDNode *N); SDValue SoftPromoteHalfRes_SELECT(SDNode *N); + SDValue SoftPromoteHalfRes_CTSELECT(SDNode *N); SDValue SoftPromoteHalfRes_SELECT_CC(SDNode *N); SDValue SoftPromoteHalfRes_UnaryOp(SDNode *N); SDValue SoftPromoteHalfRes_FABS(SDNode *N); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp index 098368ef2f6b3..a7187f3ae2bc7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp @@ -571,17 +571,9 @@ void DAGTypeLegalizer::SplitRes_Select(SDNode *N, SDValue &Lo, SDValue &Hi) { } void DAGTypeLegalizer::SplitRes_CTSELECT(SDNode *N, SDValue &Lo, SDValue &Hi) { - SDValue LL, LH, RL, RH, CL, CH; - SDLoc dl(N); - GetSplitOp(N->getOperand(1), LL, LH); - GetSplitOp(N->getOperand(2), RL, RH); - - SDValue Cond = N->getOperand(0); - CL = CH = Cond; - assert(!Cond.getValueType().isVector() && "Unsupported vector type"); - - Lo = DAG.getNode(N->getOpcode(), dl, LL.getValueType(), CL, LL, RL); - Hi = DAG.getNode(N->getOpcode(), dl, LH.getValueType(), CH, LH, RH); + // Reuse generic select splitting to support scalar and vector conditions. + // SplitRes_Select rebuilds with N->getOpcode(), so CTSELECT is preserved. + SplitRes_Select(N, Lo, Hi); } void DAGTypeLegalizer::SplitRes_SELECT_CC(SDNode *N, SDValue &Lo, diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 6eee61da2e0d1..b7e78b94c0687 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -8179,7 +8179,6 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, return V; break; } - case ISD::SELECT: case ISD::VSELECT: if (SDValue V = simplifySelect(N1, N2, N3)) diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback.ll b/llvm/test/CodeGen/RISCV/ctselect-fallback.ll index f46bde0a05b8b..c624c17d7e33e 100644 --- a/llvm/test/CodeGen/RISCV/ctselect-fallback.ll +++ b/llvm/test/CodeGen/RISCV/ctselect-fallback.ll @@ -10,7 +10,7 @@ define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) { ; RV64-NEXT: slli a0, a0, 63 ; RV64-NEXT: srai a0, a0, 63 ; RV64-NEXT: and a0, a1, a0 -; RV64-NEXT: xor a0, a0, a2 +; RV64-NEXT: xor a0, a2, a0 ; RV64-NEXT: ret ; ; RV32-LABEL: test_ctselect_i8: @@ -19,52 +19,28 @@ define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) { ; RV32-NEXT: slli a0, a0, 31 ; RV32-NEXT: srai a0, a0, 31 ; RV32-NEXT: and a0, a1, a0 -; RV32-NEXT: xor a0, a0, a2 +; RV32-NEXT: xor a0, a2, a0 ; RV32-NEXT: ret %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b) ret i8 %result } - -define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) { -; RV64-LABEL: test_ctselect_i16: +define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) { +; RV64-LABEL: test_ctselect_i32: ; RV64: # %bb.0: ; RV64-NEXT: xor a1, a1, a2 ; RV64-NEXT: slli a0, a0, 63 ; RV64-NEXT: srai a0, a0, 63 ; RV64-NEXT: and a0, a1, a0 -; RV64-NEXT: xor a0, a0, a2 +; RV64-NEXT: xor a0, a2, a0 ; RV64-NEXT: ret ; -; RV32-LABEL: test_ctselect_i16: +; RV32-LABEL: test_ctselect_i32: ; RV32: # %bb.0: ; RV32-NEXT: xor a1, a1, a2 ; RV32-NEXT: slli a0, a0, 31 ; RV32-NEXT: srai a0, a0, 31 ; RV32-NEXT: and a0, a1, a0 -; RV32-NEXT: xor a0, a0, a2 -; RV32-NEXT: ret - %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b) - ret i16 %result -} - -define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) { -; RV64-LABEL: test_ctselect_i32: -; RV64: # %bb.0: -; RV64-NEXT: xor a1, a1, a2 -; RV64-NEXT: slli a0, a0, 63 -; RV64-NEXT: srai a0, a0, 63 -; RV64-NEXT: and a0, a1, a0 -; RV64-NEXT: xor a0, a0, a2 -; RV64-NEXT: ret -; -; RV32-LABEL: test_ctselect_i32: -; RV32: # %bb.0: -; RV32-NEXT: andi a0, a0, 1 -; RV32-NEXT: neg a3, a0 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a1, a3, a1 -; RV32-NEXT: and a0, a0, a2 -; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: xor a0, a2, a0 ; RV32-NEXT: ret %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) ret i32 %result @@ -73,12 +49,11 @@ define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) { define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) { ; RV64-LABEL: test_ctselect_i64: ; RV64: # %bb.0: -; RV64-NEXT: andi a0, a0, 1 -; RV64-NEXT: neg a3, a0 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a1, a3, a1 -; RV64-NEXT: and a0, a0, a2 -; RV64-NEXT: or a0, a1, a0 +; RV64-NEXT: xor a1, a1, a2 +; RV64-NEXT: slli a0, a0, 63 +; RV64-NEXT: srai a0, a0, 63 +; RV64-NEXT: and a0, a1, a0 +; RV64-NEXT: xor a0, a2, a0 ; RV64-NEXT: ret ; ; RV32-LABEL: test_ctselect_i64: @@ -89,8 +64,8 @@ define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) { ; RV32-NEXT: srai a0, a0, 31 ; RV32-NEXT: and a1, a1, a0 ; RV32-NEXT: and a2, a2, a0 -; RV32-NEXT: xor a0, a1, a3 -; RV32-NEXT: xor a1, a2, a4 +; RV32-NEXT: xor a0, a3, a1 +; RV32-NEXT: xor a1, a4, a2 ; RV32-NEXT: ret %result = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b) ret i64 %result @@ -99,22 +74,20 @@ define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) { define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) { ; RV64-LABEL: test_ctselect_ptr: ; RV64: # %bb.0: -; RV64-NEXT: andi a0, a0, 1 -; RV64-NEXT: neg a3, a0 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a1, a3, a1 -; RV64-NEXT: and a0, a0, a2 -; RV64-NEXT: or a0, a1, a0 +; RV64-NEXT: xor a1, a1, a2 +; RV64-NEXT: slli a0, a0, 63 +; RV64-NEXT: srai a0, a0, 63 +; RV64-NEXT: and a0, a1, a0 +; RV64-NEXT: xor a0, a2, a0 ; RV64-NEXT: ret ; ; RV32-LABEL: test_ctselect_ptr: ; RV32: # %bb.0: -; RV32-NEXT: andi a0, a0, 1 -; RV32-NEXT: neg a3, a0 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a1, a3, a1 -; RV32-NEXT: and a0, a0, a2 -; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: xor a1, a1, a2 +; RV32-NEXT: slli a0, a0, 31 +; RV32-NEXT: srai a0, a0, 31 +; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: xor a0, a2, a0 ; RV32-NEXT: ret %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b) ret ptr %result @@ -128,6 +101,8 @@ define i32 @test_ctselect_const_true(i32 %a, i32 %b) { ; ; RV32-LABEL: test_ctselect_const_true: ; RV32: # %bb.0: +; RV32-NEXT: xor a0, a0, a1 +; RV32-NEXT: xor a0, a1, a0 ; RV32-NEXT: ret %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b) ret i32 %result @@ -158,131 +133,199 @@ define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) { ; RV64-NEXT: xor a2, a2, a3 ; RV64-NEXT: addi a0, a0, -1 ; RV64-NEXT: and a0, a2, a0 -; RV64-NEXT: xor a0, a0, a3 +; RV64-NEXT: xor a0, a3, a0 ; RV64-NEXT: ret ; ; RV32-LABEL: test_ctselect_icmp_eq: ; RV32: # %bb.0: ; RV32-NEXT: xor a0, a0, a1 ; RV32-NEXT: snez a0, a0 +; RV32-NEXT: xor a2, a2, a3 ; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a2, a0, a2 -; RV32-NEXT: not a0, a0 -; RV32-NEXT: and a0, a0, a3 -; RV32-NEXT: or a0, a2, a0 +; RV32-NEXT: and a0, a2, a0 +; RV32-NEXT: xor a0, a3, a0 ; RV32-NEXT: ret %cond = icmp eq i32 %x, %y %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) ret i32 %result } - -define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) { -; RV64-LABEL: test_ctselect_icmp_ne: +define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) { +; RV64-LABEL: test_ctselect_icmp_ult: ; RV64: # %bb.0: ; RV64-NEXT: sext.w a1, a1 ; RV64-NEXT: sext.w a0, a0 -; RV64-NEXT: xor a0, a0, a1 -; RV64-NEXT: seqz a0, a0 +; RV64-NEXT: sltu a0, a0, a1 ; RV64-NEXT: xor a2, a2, a3 -; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: neg a0, a0 ; RV64-NEXT: and a0, a2, a0 -; RV64-NEXT: xor a0, a0, a3 +; RV64-NEXT: xor a0, a3, a0 ; RV64-NEXT: ret ; -; RV32-LABEL: test_ctselect_icmp_ne: +; RV32-LABEL: test_ctselect_icmp_ult: ; RV32: # %bb.0: -; RV32-NEXT: xor a0, a0, a1 -; RV32-NEXT: seqz a0, a0 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a2, a0, a2 -; RV32-NEXT: not a0, a0 -; RV32-NEXT: and a0, a0, a3 -; RV32-NEXT: or a0, a2, a0 +; RV32-NEXT: sltu a0, a0, a1 +; RV32-NEXT: xor a2, a2, a3 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a0, a2, a0 +; RV32-NEXT: xor a0, a3, a0 ; RV32-NEXT: ret - %cond = icmp ne i32 %x, %y + %cond = icmp ult i32 %x, %y %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) ret i32 %result } -define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) { -; RV64-LABEL: test_ctselect_icmp_slt: +; Test with memory operands +define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) { +; RV64-LABEL: test_ctselect_load: ; RV64: # %bb.0: -; RV64-NEXT: sext.w a1, a1 -; RV64-NEXT: sext.w a0, a0 -; RV64-NEXT: slt a0, a0, a1 +; RV64-NEXT: lw a1, 0(a1) +; RV64-NEXT: lw a2, 0(a2) +; RV64-NEXT: slli a0, a0, 63 +; RV64-NEXT: xor a1, a1, a2 +; RV64-NEXT: srai a0, a0, 63 +; RV64-NEXT: and a0, a1, a0 +; RV64-NEXT: xor a0, a2, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_load: +; RV32: # %bb.0: +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: lw a2, 0(a2) +; RV32-NEXT: slli a0, a0, 31 +; RV32-NEXT: xor a1, a1, a2 +; RV32-NEXT: srai a0, a0, 31 +; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: xor a0, a2, a0 +; RV32-NEXT: ret + %a = load i32, ptr %p1 + %b = load i32, ptr %p2 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test nested CTSELECT pattern with AND merging on i1 values +; Pattern: ctselect C0, (ctselect C1, X, Y), Y -> ctselect (C0 & C1), X, Y +define i32 @test_ctselect_nested_and_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) { +; RV64-LABEL: test_ctselect_nested_and_i1_to_i32: +; RV64: # %bb.0: +; RV64-NEXT: and a0, a1, a0 ; RV64-NEXT: xor a2, a2, a3 -; RV64-NEXT: neg a0, a0 +; RV64-NEXT: slli a0, a0, 63 +; RV64-NEXT: srai a0, a0, 63 ; RV64-NEXT: and a0, a2, a0 -; RV64-NEXT: xor a0, a0, a3 +; RV64-NEXT: xor a0, a3, a0 ; RV64-NEXT: ret ; -; RV32-LABEL: test_ctselect_icmp_slt: +; RV32-LABEL: test_ctselect_nested_and_i1_to_i32: ; RV32: # %bb.0: -; RV32-NEXT: slt a0, a0, a1 -; RV32-NEXT: neg a1, a0 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a1, a1, a2 -; RV32-NEXT: and a0, a0, a3 -; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: xor a2, a2, a3 +; RV32-NEXT: slli a0, a0, 31 +; RV32-NEXT: srai a0, a0, 31 +; RV32-NEXT: and a0, a2, a0 +; RV32-NEXT: xor a0, a3, a0 ; RV32-NEXT: ret - %cond = icmp slt i32 %x, %y - %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + %inner = call i1 @llvm.ct.select.i1(i1 %c1, i1 true, i1 false) + %cond = call i1 @llvm.ct.select.i1(i1 %c0, i1 %inner, i1 false) + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %y) ret i32 %result } -define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) { -; RV64-LABEL: test_ctselect_icmp_ult: +; Test nested CTSELECT pattern with OR merging on i1 values +; Pattern: ctselect C0, X, (ctselect C1, X, Y) -> ctselect (C0 | C1), X, Y +define i32 @test_ctselect_nested_or_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) { +; RV64-LABEL: test_ctselect_nested_or_i1_to_i32: ; RV64: # %bb.0: -; RV64-NEXT: sext.w a1, a1 -; RV64-NEXT: sext.w a0, a0 -; RV64-NEXT: sltu a0, a0, a1 +; RV64-NEXT: or a0, a0, a1 ; RV64-NEXT: xor a2, a2, a3 -; RV64-NEXT: neg a0, a0 +; RV64-NEXT: slli a0, a0, 63 +; RV64-NEXT: srai a0, a0, 63 ; RV64-NEXT: and a0, a2, a0 -; RV64-NEXT: xor a0, a0, a3 +; RV64-NEXT: xor a0, a3, a0 ; RV64-NEXT: ret ; -; RV32-LABEL: test_ctselect_icmp_ult: +; RV32-LABEL: test_ctselect_nested_or_i1_to_i32: ; RV32: # %bb.0: -; RV32-NEXT: sltu a0, a0, a1 -; RV32-NEXT: neg a1, a0 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a1, a1, a2 -; RV32-NEXT: and a0, a0, a3 -; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: xor a2, a2, a3 +; RV32-NEXT: slli a0, a0, 31 +; RV32-NEXT: srai a0, a0, 31 +; RV32-NEXT: and a0, a2, a0 +; RV32-NEXT: xor a0, a3, a0 ; RV32-NEXT: ret - %cond = icmp ult i32 %x, %y - %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + %inner = call i1 @llvm.ct.select.i1(i1 %c1, i1 true, i1 false) + %cond = call i1 @llvm.ct.select.i1(i1 %c0, i1 true, i1 %inner) + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %y) ret i32 %result } -; Test with memory operands -define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) { -; RV64-LABEL: test_ctselect_load: +; Test double nested CTSELECT with recursive AND merging +; Pattern: ctselect C0, (ctselect C1, (ctselect C2, X, Y), Y), Y +; -> ctselect (C0 & C1 & C2), X, Y +define i32 @test_ctselect_double_nested_and_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x, i32 %y) { +; RV64-LABEL: test_ctselect_double_nested_and_i1: ; RV64: # %bb.0: -; RV64-NEXT: lw a1, 0(a1) -; RV64-NEXT: lw a2, 0(a2) +; RV64-NEXT: and a1, a2, a1 +; RV64-NEXT: and a0, a1, a0 +; RV64-NEXT: xor a3, a3, a4 ; RV64-NEXT: slli a0, a0, 63 -; RV64-NEXT: xor a1, a1, a2 ; RV64-NEXT: srai a0, a0, 63 +; RV64-NEXT: and a0, a3, a0 +; RV64-NEXT: xor a0, a4, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_double_nested_and_i1: +; RV32: # %bb.0: +; RV32-NEXT: and a1, a2, a1 +; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: xor a3, a3, a4 +; RV32-NEXT: slli a0, a0, 31 +; RV32-NEXT: srai a0, a0, 31 +; RV32-NEXT: and a0, a3, a0 +; RV32-NEXT: xor a0, a4, a0 +; RV32-NEXT: ret + %inner2 = call i1 @llvm.ct.select.i1(i1 %c2, i1 true, i1 false) + %inner1 = call i1 @llvm.ct.select.i1(i1 %c1, i1 %inner2, i1 false) + %cond = call i1 @llvm.ct.select.i1(i1 %c0, i1 %inner1, i1 false) + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %y) + ret i32 %result +} + +; Test double nested CTSELECT with mixed AND/OR patterns +define i32 @test_ctselect_double_nested_mixed_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x, i32 %y, i32 %z) { +; RV64-LABEL: test_ctselect_double_nested_mixed_i1: +; RV64: # %bb.0: ; RV64-NEXT: and a0, a1, a0 -; RV64-NEXT: xor a0, a0, a2 +; RV64-NEXT: xor a3, a3, a4 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: slli a0, a0, 63 +; RV64-NEXT: srai a0, a0, 63 +; RV64-NEXT: and a3, a3, a0 +; RV64-NEXT: xor a4, a4, a5 +; RV64-NEXT: xor a3, a4, a3 +; RV64-NEXT: and a0, a3, a0 +; RV64-NEXT: xor a0, a5, a0 ; RV64-NEXT: ret ; -; RV32-LABEL: test_ctselect_load: +; RV32-LABEL: test_ctselect_double_nested_mixed_i1: ; RV32: # %bb.0: -; RV32-NEXT: lw a1, 0(a1) -; RV32-NEXT: lw a2, 0(a2) -; RV32-NEXT: andi a0, a0, 1 -; RV32-NEXT: neg a3, a0 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a1, a3, a1 -; RV32-NEXT: and a0, a0, a2 -; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: xor a3, a3, a4 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: slli a0, a0, 31 +; RV32-NEXT: srai a0, a0, 31 +; RV32-NEXT: and a3, a3, a0 +; RV32-NEXT: xor a4, a4, a5 +; RV32-NEXT: xor a3, a4, a3 +; RV32-NEXT: and a0, a3, a0 +; RV32-NEXT: xor a0, a5, a0 ; RV32-NEXT: ret - %a = load i32, ptr %p1 - %b = load i32, ptr %p2 - %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + %inner1 = call i1 @llvm.ct.select.i1(i1 %c1, i1 true, i1 false) + %and_cond = call i1 @llvm.ct.select.i1(i1 %c0, i1 %inner1, i1 false) + %inner2 = call i1 @llvm.ct.select.i1(i1 %c2, i1 true, i1 false) + %or_cond = call i1 @llvm.ct.select.i1(i1 %and_cond, i1 true, i1 %inner2) + %inner_result = call i32 @llvm.ct.select.i32(i1 %or_cond, i32 %x, i32 %y) + %result = call i32 @llvm.ct.select.i32(i1 %or_cond, i32 %inner_result, i32 %z) ret i32 %result } @@ -296,35 +339,416 @@ define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) { ; RV64-NEXT: slli a0, a0, 63 ; RV64-NEXT: srai a1, a1, 63 ; RV64-NEXT: and a1, a2, a1 -; RV64-NEXT: xor a1, a1, a3 +; RV64-NEXT: xor a1, a3, a1 ; RV64-NEXT: srai a0, a0, 63 ; RV64-NEXT: and a0, a1, a0 -; RV64-NEXT: xor a0, a0, a4 +; RV64-NEXT: xor a0, a4, a0 ; RV64-NEXT: ret ; ; RV32-LABEL: test_ctselect_nested: ; RV32: # %bb.0: -; RV32-NEXT: andi a1, a1, 1 -; RV32-NEXT: andi a0, a0, 1 -; RV32-NEXT: neg a5, a1 -; RV32-NEXT: addi a1, a1, -1 -; RV32-NEXT: and a2, a5, a2 -; RV32-NEXT: neg a5, a0 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a1, a1, a3 -; RV32-NEXT: or a1, a2, a1 -; RV32-NEXT: and a1, a5, a1 -; RV32-NEXT: and a0, a0, a4 -; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: xor a2, a2, a3 +; RV32-NEXT: slli a1, a1, 31 +; RV32-NEXT: xor a3, a3, a4 +; RV32-NEXT: slli a0, a0, 31 +; RV32-NEXT: srai a1, a1, 31 +; RV32-NEXT: and a1, a2, a1 +; RV32-NEXT: xor a1, a3, a1 +; RV32-NEXT: srai a0, a0, 31 +; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: xor a0, a4, a0 ; RV32-NEXT: ret %inner = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b) %result = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %inner, i32 %c) ret i32 %result } +; Test floating-point ct.select selecting between NaN and Inf +define float @test_ctselect_f32_nan_inf(i1 %cond) { +; RV64-LABEL: test_ctselect_f32_nan_inf: +; RV64: # %bb.0: +; RV64-NEXT: slli a0, a0, 63 +; RV64-NEXT: lui a1, 1024 +; RV64-NEXT: srai a0, a0, 63 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: lui a1, 522240 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_f32_nan_inf: +; RV32: # %bb.0: +; RV32-NEXT: slli a0, a0, 31 +; RV32-NEXT: lui a1, 1024 +; RV32-NEXT: srai a0, a0, 31 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: lui a1, 522240 +; RV32-NEXT: xor a0, a0, a1 +; RV32-NEXT: ret + %result = call float @llvm.ct.select.f32(i1 %cond, float 0x7FF8000000000000, float 0x7FF0000000000000) + ret float %result +} + +define double @test_ctselect_f64_nan_inf(i1 %cond) { +; RV64-LABEL: test_ctselect_f64_nan_inf: +; RV64: # %bb.0: +; RV64-NEXT: slli a0, a0, 63 +; RV64-NEXT: li a1, 1 +; RV64-NEXT: srai a0, a0, 63 +; RV64-NEXT: slli a1, a1, 51 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: li a1, 2047 +; RV64-NEXT: slli a1, a1, 52 +; RV64-NEXT: xor a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_f64_nan_inf: +; RV32: # %bb.0: +; RV32-NEXT: slli a0, a0, 31 +; RV32-NEXT: lui a1, 128 +; RV32-NEXT: srai a0, a0, 31 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: lui a1, 524032 +; RV32-NEXT: or a1, a0, a1 +; RV32-NEXT: li a0, 0 +; RV32-NEXT: ret + %result = call double @llvm.ct.select.f64(i1 %cond, double 0x7FF8000000000000, double 0x7FF0000000000000) + ret double %result +} + +; Test basic floating-point ct.select +define float @test_ctselect_f32(i1 %cond, float %a, float %b) { +; RV64-LABEL: test_ctselect_f32: +; RV64: # %bb.0: +; RV64-NEXT: xor a1, a1, a2 +; RV64-NEXT: slli a0, a0, 63 +; RV64-NEXT: srai a0, a0, 63 +; RV64-NEXT: and a0, a1, a0 +; RV64-NEXT: xor a0, a2, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_f32: +; RV32: # %bb.0: +; RV32-NEXT: xor a1, a1, a2 +; RV32-NEXT: slli a0, a0, 31 +; RV32-NEXT: srai a0, a0, 31 +; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: xor a0, a2, a0 +; RV32-NEXT: ret + %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %result +} + +define double @test_ctselect_f64(i1 %cond, double %a, double %b) { +; RV64-LABEL: test_ctselect_f64: +; RV64: # %bb.0: +; RV64-NEXT: xor a1, a1, a2 +; RV64-NEXT: slli a0, a0, 63 +; RV64-NEXT: srai a0, a0, 63 +; RV64-NEXT: and a0, a1, a0 +; RV64-NEXT: xor a0, a2, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_f64: +; RV32: # %bb.0: +; RV32-NEXT: xor a1, a1, a3 +; RV32-NEXT: slli a0, a0, 31 +; RV32-NEXT: xor a2, a2, a4 +; RV32-NEXT: srai a0, a0, 31 +; RV32-NEXT: and a1, a1, a0 +; RV32-NEXT: and a2, a2, a0 +; RV32-NEXT: xor a0, a3, a1 +; RV32-NEXT: xor a1, a4, a2 +; RV32-NEXT: ret + %result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b) + ret double %result +} + +; Test vector ct.select with integer vectors +define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) { +; RV64-LABEL: test_ctselect_v4i32: +; RV64: # %bb.0: +; RV64-NEXT: lw a4, 0(a3) +; RV64-NEXT: lw a5, 8(a3) +; RV64-NEXT: lw a6, 16(a3) +; RV64-NEXT: lw a3, 24(a3) +; RV64-NEXT: lw a7, 0(a2) +; RV64-NEXT: lw t0, 8(a2) +; RV64-NEXT: lw t1, 16(a2) +; RV64-NEXT: lw a2, 24(a2) +; RV64-NEXT: slli a1, a1, 63 +; RV64-NEXT: srai a1, a1, 63 +; RV64-NEXT: xor a7, a7, a4 +; RV64-NEXT: xor t0, t0, a5 +; RV64-NEXT: xor t1, t1, a6 +; RV64-NEXT: xor a2, a2, a3 +; RV64-NEXT: and a7, a7, a1 +; RV64-NEXT: and t0, t0, a1 +; RV64-NEXT: and t1, t1, a1 +; RV64-NEXT: and a1, a2, a1 +; RV64-NEXT: xor a2, a4, a7 +; RV64-NEXT: xor a4, a5, t0 +; RV64-NEXT: xor a5, a6, t1 +; RV64-NEXT: xor a1, a3, a1 +; RV64-NEXT: sw a2, 0(a0) +; RV64-NEXT: sw a4, 4(a0) +; RV64-NEXT: sw a5, 8(a0) +; RV64-NEXT: sw a1, 12(a0) +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_v4i32: +; RV32: # %bb.0: +; RV32-NEXT: lw a4, 0(a3) +; RV32-NEXT: lw a5, 4(a3) +; RV32-NEXT: lw a6, 8(a3) +; RV32-NEXT: lw a3, 12(a3) +; RV32-NEXT: lw a7, 0(a2) +; RV32-NEXT: lw t0, 4(a2) +; RV32-NEXT: lw t1, 8(a2) +; RV32-NEXT: lw a2, 12(a2) +; RV32-NEXT: slli a1, a1, 31 +; RV32-NEXT: srai a1, a1, 31 +; RV32-NEXT: xor a7, a7, a4 +; RV32-NEXT: xor t0, t0, a5 +; RV32-NEXT: xor t1, t1, a6 +; RV32-NEXT: xor a2, a2, a3 +; RV32-NEXT: and a7, a7, a1 +; RV32-NEXT: and t0, t0, a1 +; RV32-NEXT: and t1, t1, a1 +; RV32-NEXT: and a1, a2, a1 +; RV32-NEXT: xor a2, a4, a7 +; RV32-NEXT: xor a4, a5, t0 +; RV32-NEXT: xor a5, a6, t1 +; RV32-NEXT: xor a1, a3, a1 +; RV32-NEXT: sw a2, 0(a0) +; RV32-NEXT: sw a4, 4(a0) +; RV32-NEXT: sw a5, 8(a0) +; RV32-NEXT: sw a1, 12(a0) +; RV32-NEXT: ret + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} +define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) { +; RV64-LABEL: test_ctselect_v4f32: +; RV64: # %bb.0: +; RV64-NEXT: lw a4, 0(a3) +; RV64-NEXT: lw a5, 8(a3) +; RV64-NEXT: lw a6, 16(a3) +; RV64-NEXT: lw a3, 24(a3) +; RV64-NEXT: lw a7, 0(a2) +; RV64-NEXT: lw t0, 8(a2) +; RV64-NEXT: lw t1, 16(a2) +; RV64-NEXT: lw a2, 24(a2) +; RV64-NEXT: slli a1, a1, 63 +; RV64-NEXT: srai a1, a1, 63 +; RV64-NEXT: xor a7, a7, a4 +; RV64-NEXT: xor t0, t0, a5 +; RV64-NEXT: xor t1, t1, a6 +; RV64-NEXT: xor a2, a2, a3 +; RV64-NEXT: and a7, a7, a1 +; RV64-NEXT: and t0, t0, a1 +; RV64-NEXT: and t1, t1, a1 +; RV64-NEXT: and a1, a2, a1 +; RV64-NEXT: xor a2, a4, a7 +; RV64-NEXT: xor a4, a5, t0 +; RV64-NEXT: xor a5, a6, t1 +; RV64-NEXT: xor a1, a3, a1 +; RV64-NEXT: sw a2, 0(a0) +; RV64-NEXT: sw a4, 4(a0) +; RV64-NEXT: sw a5, 8(a0) +; RV64-NEXT: sw a1, 12(a0) +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_v4f32: +; RV32: # %bb.0: +; RV32-NEXT: lw a4, 0(a3) +; RV32-NEXT: lw a5, 4(a3) +; RV32-NEXT: lw a6, 8(a3) +; RV32-NEXT: lw a3, 12(a3) +; RV32-NEXT: lw a7, 0(a2) +; RV32-NEXT: lw t0, 4(a2) +; RV32-NEXT: lw t1, 8(a2) +; RV32-NEXT: lw a2, 12(a2) +; RV32-NEXT: slli a1, a1, 31 +; RV32-NEXT: srai a1, a1, 31 +; RV32-NEXT: xor a7, a7, a4 +; RV32-NEXT: xor t0, t0, a5 +; RV32-NEXT: xor t1, t1, a6 +; RV32-NEXT: xor a2, a2, a3 +; RV32-NEXT: and a7, a7, a1 +; RV32-NEXT: and t0, t0, a1 +; RV32-NEXT: and t1, t1, a1 +; RV32-NEXT: and a1, a2, a1 +; RV32-NEXT: xor a2, a4, a7 +; RV32-NEXT: xor a4, a5, t0 +; RV32-NEXT: xor a5, a6, t1 +; RV32-NEXT: xor a1, a3, a1 +; RV32-NEXT: sw a2, 0(a0) +; RV32-NEXT: sw a4, 4(a0) +; RV32-NEXT: sw a5, 8(a0) +; RV32-NEXT: sw a1, 12(a0) +; RV32-NEXT: ret + %result = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) + ret <4 x float> %result +} +define <8 x i32> @test_ctselect_v8i32(i1 %cond, <8 x i32> %a, <8 x i32> %b) { +; RV64-LABEL: test_ctselect_v8i32: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -32 +; RV64-NEXT: .cfi_def_cfa_offset 32 +; RV64-NEXT: sd s0, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s1, 16(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s2, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset s0, -8 +; RV64-NEXT: .cfi_offset s1, -16 +; RV64-NEXT: .cfi_offset s2, -24 +; RV64-NEXT: lw a7, 32(a3) +; RV64-NEXT: lw a6, 40(a3) +; RV64-NEXT: lw a5, 48(a3) +; RV64-NEXT: lw a4, 56(a3) +; RV64-NEXT: lw t0, 32(a2) +; RV64-NEXT: lw t1, 40(a2) +; RV64-NEXT: lw t2, 48(a2) +; RV64-NEXT: lw t3, 56(a2) +; RV64-NEXT: lw t4, 0(a3) +; RV64-NEXT: lw t5, 8(a3) +; RV64-NEXT: lw t6, 16(a3) +; RV64-NEXT: lw a3, 24(a3) +; RV64-NEXT: lw s0, 0(a2) +; RV64-NEXT: lw s1, 8(a2) +; RV64-NEXT: lw s2, 16(a2) +; RV64-NEXT: lw a2, 24(a2) +; RV64-NEXT: slli a1, a1, 63 +; RV64-NEXT: srai a1, a1, 63 +; RV64-NEXT: xor s0, s0, t4 +; RV64-NEXT: xor s1, s1, t5 +; RV64-NEXT: xor s2, s2, t6 +; RV64-NEXT: xor a2, a2, a3 +; RV64-NEXT: xor t0, t0, a7 +; RV64-NEXT: xor t1, t1, a6 +; RV64-NEXT: xor t2, t2, a5 +; RV64-NEXT: xor t3, t3, a4 +; RV64-NEXT: and s0, s0, a1 +; RV64-NEXT: and s1, s1, a1 +; RV64-NEXT: and s2, s2, a1 +; RV64-NEXT: and a2, a2, a1 +; RV64-NEXT: and t0, t0, a1 +; RV64-NEXT: and t1, t1, a1 +; RV64-NEXT: and t2, t2, a1 +; RV64-NEXT: and a1, t3, a1 +; RV64-NEXT: xor t3, t4, s0 +; RV64-NEXT: xor t4, t5, s1 +; RV64-NEXT: xor t5, t6, s2 +; RV64-NEXT: xor a2, a3, a2 +; RV64-NEXT: xor a3, a7, t0 +; RV64-NEXT: xor a6, a6, t1 +; RV64-NEXT: xor a5, a5, t2 +; RV64-NEXT: xor a1, a4, a1 +; RV64-NEXT: sw a3, 16(a0) +; RV64-NEXT: sw a6, 20(a0) +; RV64-NEXT: sw a5, 24(a0) +; RV64-NEXT: sw a1, 28(a0) +; RV64-NEXT: sw t3, 0(a0) +; RV64-NEXT: sw t4, 4(a0) +; RV64-NEXT: sw t5, 8(a0) +; RV64-NEXT: sw a2, 12(a0) +; RV64-NEXT: ld s0, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s1, 16(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s2, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: .cfi_restore s0 +; RV64-NEXT: .cfi_restore s1 +; RV64-NEXT: .cfi_restore s2 +; RV64-NEXT: addi sp, sp, 32 +; RV64-NEXT: .cfi_def_cfa_offset 0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_v8i32: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 4(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset s0, -4 +; RV32-NEXT: .cfi_offset s1, -8 +; RV32-NEXT: .cfi_offset s2, -12 +; RV32-NEXT: lw a7, 16(a3) +; RV32-NEXT: lw a6, 20(a3) +; RV32-NEXT: lw a5, 24(a3) +; RV32-NEXT: lw a4, 28(a3) +; RV32-NEXT: lw t0, 16(a2) +; RV32-NEXT: lw t1, 20(a2) +; RV32-NEXT: lw t2, 24(a2) +; RV32-NEXT: lw t3, 28(a2) +; RV32-NEXT: lw t4, 0(a3) +; RV32-NEXT: lw t5, 4(a3) +; RV32-NEXT: lw t6, 8(a3) +; RV32-NEXT: lw a3, 12(a3) +; RV32-NEXT: lw s0, 0(a2) +; RV32-NEXT: lw s1, 4(a2) +; RV32-NEXT: lw s2, 8(a2) +; RV32-NEXT: lw a2, 12(a2) +; RV32-NEXT: slli a1, a1, 31 +; RV32-NEXT: srai a1, a1, 31 +; RV32-NEXT: xor s0, s0, t4 +; RV32-NEXT: xor s1, s1, t5 +; RV32-NEXT: xor s2, s2, t6 +; RV32-NEXT: xor a2, a2, a3 +; RV32-NEXT: xor t0, t0, a7 +; RV32-NEXT: xor t1, t1, a6 +; RV32-NEXT: xor t2, t2, a5 +; RV32-NEXT: xor t3, t3, a4 +; RV32-NEXT: and s0, s0, a1 +; RV32-NEXT: and s1, s1, a1 +; RV32-NEXT: and s2, s2, a1 +; RV32-NEXT: and a2, a2, a1 +; RV32-NEXT: and t0, t0, a1 +; RV32-NEXT: and t1, t1, a1 +; RV32-NEXT: and t2, t2, a1 +; RV32-NEXT: and a1, t3, a1 +; RV32-NEXT: xor t3, t4, s0 +; RV32-NEXT: xor t4, t5, s1 +; RV32-NEXT: xor t5, t6, s2 +; RV32-NEXT: xor a2, a3, a2 +; RV32-NEXT: xor a3, a7, t0 +; RV32-NEXT: xor a6, a6, t1 +; RV32-NEXT: xor a5, a5, t2 +; RV32-NEXT: xor a1, a4, a1 +; RV32-NEXT: sw a3, 16(a0) +; RV32-NEXT: sw a6, 20(a0) +; RV32-NEXT: sw a5, 24(a0) +; RV32-NEXT: sw a1, 28(a0) +; RV32-NEXT: sw t3, 0(a0) +; RV32-NEXT: sw t4, 4(a0) +; RV32-NEXT: sw t5, 8(a0) +; RV32-NEXT: sw a2, 12(a0) +; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s2, 4(sp) # 4-byte Folded Reload +; RV32-NEXT: .cfi_restore s0 +; RV32-NEXT: .cfi_restore s1 +; RV32-NEXT: .cfi_restore s2 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: ret + %result = call <8 x i32> @llvm.ct.select.v8i32(i1 %cond, <8 x i32> %a, <8 x i32> %b) + ret <8 x i32> %result +} + ; Declare the intrinsics +declare i1 @llvm.ct.select.i1(i1, i1, i1) declare i8 @llvm.ct.select.i8(i1, i8, i8) declare i16 @llvm.ct.select.i16(i1, i16, i16) declare i32 @llvm.ct.select.i32(i1, i32, i32) declare i64 @llvm.ct.select.i64(i1, i64, i64) declare ptr @llvm.ct.select.p0(i1, ptr, ptr) +declare float @llvm.ct.select.f32(i1, float, float) +declare double @llvm.ct.select.f64(i1, double, double) + +; Vector intrinsics +declare <4 x i32> @llvm.ct.select.v4i32(i1, <4 x i32>, <4 x i32>) +declare <2 x i64> @llvm.ct.select.v2i64(i1, <2 x i64>, <2 x i64>) +declare <8 x i16> @llvm.ct.select.v8i16(i1, <8 x i16>, <8 x i16>) +declare <16 x i8> @llvm.ct.select.v16i8(i1, <16 x i8>, <16 x i8>) +declare <4 x float> @llvm.ct.select.v4f32(i1, <4 x float>, <4 x float>) +declare <2 x double> @llvm.ct.select.v2f64(i1, <2 x double>, <2 x double>) +declare <8 x i32> @llvm.ct.select.v8i32(i1, <8 x i32>, <8 x i32>) diff --git a/llvm/test/CodeGen/X86/ctselect.ll b/llvm/test/CodeGen/X86/ctselect.ll index 095787a5e2a4b..2b6091c880637 100644 --- a/llvm/test/CodeGen/X86/ctselect.ll +++ b/llvm/test/CodeGen/X86/ctselect.ll @@ -8,120 +8,75 @@ define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) { ; X64-LABEL: test_ctselect_i8: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: andb $1, %dil -; X64-NEXT: leal -1(%rdi), %eax -; X64-NEXT: movl %edi, %ecx -; X64-NEXT: negb %cl -; X64-NEXT: andb %sil, %cl -; X64-NEXT: andb %dl, %al -; X64-NEXT: orb %cl, %al +; X64-NEXT: movl %edi, %eax +; X64-NEXT: xorl %edx, %esi +; X64-NEXT: andb $1, %al +; X64-NEXT: negb %al +; X64-NEXT: andb %sil, %al +; X64-NEXT: xorb %dl, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_i8: ; X32: # %bb.0: ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X32-NEXT: xorb %cl, %dl ; X32-NEXT: andb $1, %al -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: negb %cl -; X32-NEXT: andb {{[0-9]+}}(%esp), %cl -; X32-NEXT: decb %al -; X32-NEXT: andb {{[0-9]+}}(%esp), %al -; X32-NEXT: orb %cl, %al +; X32-NEXT: negb %al +; X32-NEXT: andb %dl, %al +; X32-NEXT: xorb %cl, %al ; X32-NEXT: retl ; ; X32-NOCMOV-LABEL: test_ctselect_i8: ; X32-NOCMOV: # %bb.0: ; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: xorb %cl, %dl ; X32-NOCMOV-NEXT: andb $1, %al -; X32-NOCMOV-NEXT: movl %eax, %ecx -; X32-NOCMOV-NEXT: negb %cl -; X32-NOCMOV-NEXT: andb {{[0-9]+}}(%esp), %cl -; X32-NOCMOV-NEXT: decb %al -; X32-NOCMOV-NEXT: andb {{[0-9]+}}(%esp), %al -; X32-NOCMOV-NEXT: orb %cl, %al +; X32-NOCMOV-NEXT: negb %al +; X32-NOCMOV-NEXT: andb %dl, %al +; X32-NOCMOV-NEXT: xorb %cl, %al ; X32-NOCMOV-NEXT: retl %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b) ret i8 %result } -define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) { -; X64-LABEL: test_ctselect_i16: +define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) { +; X64-LABEL: test_ctselect_i32: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: andl $1, %edi -; X64-NEXT: leal -1(%rdi), %ecx ; X64-NEXT: movl %edi, %eax +; X64-NEXT: xorl %edx, %esi +; X64-NEXT: andl $1, %eax ; X64-NEXT: negl %eax ; X64-NEXT: andl %esi, %eax -; X64-NEXT: andl %edx, %ecx -; X64-NEXT: orl %ecx, %eax -; X64-NEXT: # kill: def $ax killed $ax killed $eax -; X64-NEXT: retq -; -; X32-LABEL: test_ctselect_i16: -; X32: # %bb.0: -; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X32-NEXT: andl $1, %eax -; X32-NEXT: leal -1(%eax), %ecx -; X32-NEXT: andw {{[0-9]+}}(%esp), %cx -; X32-NEXT: negl %eax -; X32-NEXT: andw {{[0-9]+}}(%esp), %ax -; X32-NEXT: orl %ecx, %eax -; X32-NEXT: # kill: def $ax killed $ax killed $eax -; X32-NEXT: retl -; -; X32-NOCMOV-LABEL: test_ctselect_i16: -; X32-NOCMOV: # %bb.0: -; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X32-NOCMOV-NEXT: andl $1, %eax -; X32-NOCMOV-NEXT: leal -1(%eax), %ecx -; X32-NOCMOV-NEXT: andw {{[0-9]+}}(%esp), %cx -; X32-NOCMOV-NEXT: negl %eax -; X32-NOCMOV-NEXT: andw {{[0-9]+}}(%esp), %ax -; X32-NOCMOV-NEXT: orl %ecx, %eax -; X32-NOCMOV-NEXT: # kill: def $ax killed $ax killed $eax -; X32-NOCMOV-NEXT: retl - %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b) - ret i16 %result -} - -define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) { -; X64-LABEL: test_ctselect_i32: -; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: andl $1, %edi -; X64-NEXT: leal -1(%rdi), %eax -; X64-NEXT: movl %edi, %ecx -; X64-NEXT: negl %ecx -; X64-NEXT: andl %esi, %ecx -; X64-NEXT: andl %edx, %eax -; X64-NEXT: orl %ecx, %eax +; X64-NEXT: xorl %edx, %eax ; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_i32: ; X32: # %bb.0: ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: xorl %ecx, %edx ; X32-NEXT: andl $1, %eax -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: negl %ecx -; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: decl %eax -; X32-NEXT: andl {{[0-9]+}}(%esp), %eax -; X32-NEXT: orl %ecx, %eax +; X32-NEXT: negl %eax +; X32-NEXT: andl %edx, %eax +; X32-NEXT: xorl %ecx, %eax ; X32-NEXT: retl ; ; X32-NOCMOV-LABEL: test_ctselect_i32: ; X32-NOCMOV: # %bb.0: ; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: xorl %ecx, %edx ; X32-NOCMOV-NEXT: andl $1, %eax -; X32-NOCMOV-NEXT: movl %eax, %ecx -; X32-NOCMOV-NEXT: negl %ecx -; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X32-NOCMOV-NEXT: decl %eax -; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %eax -; X32-NOCMOV-NEXT: orl %ecx, %eax +; X32-NOCMOV-NEXT: negl %eax +; X32-NOCMOV-NEXT: andl %edx, %eax +; X32-NOCMOV-NEXT: xorl %ecx, %eax ; X32-NOCMOV-NEXT: retl %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) ret i32 %result @@ -130,13 +85,12 @@ define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) { define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) { ; X64-LABEL: test_ctselect_i64: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: andl $1, %edi -; X64-NEXT: leaq -1(%rdi), %rax -; X64-NEXT: negq %rdi -; X64-NEXT: andq %rsi, %rdi -; X64-NEXT: andq %rdx, %rax -; X64-NEXT: orq %rdi, %rax +; X64-NEXT: movl %edi, %eax +; X64-NEXT: xorq %rdx, %rsi +; X64-NEXT: andl $1, %eax +; X64-NEXT: negq %rax +; X64-NEXT: andq %rsi, %rax +; X64-NEXT: xorq %rdx, %rax ; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_i64: @@ -190,14 +144,12 @@ define float @test_ctselect_f32(i1 %cond, float %a, float %b) { ; X64-LABEL: test_ctselect_f32: ; X64: # %bb.0: ; X64-NEXT: movd %xmm1, %eax +; X64-NEXT: pxor %xmm1, %xmm0 ; X64-NEXT: movd %xmm0, %ecx ; X64-NEXT: andl $1, %edi -; X64-NEXT: movl %edi, %edx -; X64-NEXT: negl %edx -; X64-NEXT: andl %ecx, %edx -; X64-NEXT: decl %edi -; X64-NEXT: andl %eax, %edi -; X64-NEXT: orl %edx, %edi +; X64-NEXT: negl %edi +; X64-NEXT: andl %ecx, %edi +; X64-NEXT: xorl %eax, %edi ; X64-NEXT: movd %edi, %xmm0 ; X64-NEXT: retq ; @@ -206,13 +158,13 @@ define float @test_ctselect_f32(i1 %cond, float %a, float %b) { ; X32-NEXT: pushl %eax ; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: xorl %ecx, %edx ; X32-NEXT: andl $1, %eax -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: negl %ecx -; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: decl %eax -; X32-NEXT: andl {{[0-9]+}}(%esp), %eax -; X32-NEXT: orl %ecx, %eax +; X32-NEXT: negl %eax +; X32-NEXT: andl %edx, %eax +; X32-NEXT: xorl %ecx, %eax ; X32-NEXT: movl %eax, (%esp) ; X32-NEXT: flds (%esp) ; X32-NEXT: popl %eax @@ -224,13 +176,13 @@ define float @test_ctselect_f32(i1 %cond, float %a, float %b) { ; X32-NOCMOV-NEXT: pushl %eax ; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 ; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: xorl %ecx, %edx ; X32-NOCMOV-NEXT: andl $1, %eax -; X32-NOCMOV-NEXT: movl %eax, %ecx -; X32-NOCMOV-NEXT: negl %ecx -; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X32-NOCMOV-NEXT: decl %eax -; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %eax -; X32-NOCMOV-NEXT: orl %ecx, %eax +; X32-NOCMOV-NEXT: negl %eax +; X32-NOCMOV-NEXT: andl %edx, %eax +; X32-NOCMOV-NEXT: xorl %ecx, %eax ; X32-NOCMOV-NEXT: movl %eax, (%esp) ; X32-NOCMOV-NEXT: flds (%esp) ; X32-NOCMOV-NEXT: popl %eax @@ -245,14 +197,12 @@ define double @test_ctselect_f64(i1 %cond, double %a, double %b) { ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: movq %xmm1, %rax +; X64-NEXT: pxor %xmm1, %xmm0 ; X64-NEXT: movq %xmm0, %rcx ; X64-NEXT: andl $1, %edi -; X64-NEXT: movq %rdi, %rdx -; X64-NEXT: negq %rdx -; X64-NEXT: andq %rcx, %rdx -; X64-NEXT: decq %rdi -; X64-NEXT: andq %rax, %rdi -; X64-NEXT: orq %rdx, %rdi +; X64-NEXT: negq %rdi +; X64-NEXT: andq %rcx, %rdi +; X64-NEXT: xorq %rax, %rdi ; X64-NEXT: movq %rdi, %xmm0 ; X64-NEXT: retq ; @@ -320,37 +270,36 @@ define double @test_ctselect_f64(i1 %cond, double %a, double %b) { define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) { ; X64-LABEL: test_ctselect_ptr: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: andl $1, %edi -; X64-NEXT: leaq -1(%rdi), %rax -; X64-NEXT: negq %rdi -; X64-NEXT: andq %rsi, %rdi -; X64-NEXT: andq %rdx, %rax -; X64-NEXT: orq %rdi, %rax +; X64-NEXT: movl %edi, %eax +; X64-NEXT: xorq %rdx, %rsi +; X64-NEXT: andl $1, %eax +; X64-NEXT: negq %rax +; X64-NEXT: andq %rsi, %rax +; X64-NEXT: xorq %rdx, %rax ; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_ptr: ; X32: # %bb.0: ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: xorl %ecx, %edx ; X32-NEXT: andl $1, %eax -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: negl %ecx -; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: decl %eax -; X32-NEXT: andl {{[0-9]+}}(%esp), %eax -; X32-NEXT: orl %ecx, %eax +; X32-NEXT: negl %eax +; X32-NEXT: andl %edx, %eax +; X32-NEXT: xorl %ecx, %eax ; X32-NEXT: retl ; ; X32-NOCMOV-LABEL: test_ctselect_ptr: ; X32-NOCMOV: # %bb.0: ; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: xorl %ecx, %edx ; X32-NOCMOV-NEXT: andl $1, %eax -; X32-NOCMOV-NEXT: movl %eax, %ecx -; X32-NOCMOV-NEXT: negl %ecx -; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X32-NOCMOV-NEXT: decl %eax -; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %eax -; X32-NOCMOV-NEXT: orl %ecx, %eax +; X32-NOCMOV-NEXT: negl %eax +; X32-NOCMOV-NEXT: andl %edx, %eax +; X32-NOCMOV-NEXT: xorl %ecx, %eax ; X32-NOCMOV-NEXT: retl %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b) ret ptr %result @@ -361,16 +310,24 @@ define i32 @test_ctselect_const_true(i32 %a, i32 %b) { ; X64-LABEL: test_ctselect_const_true: ; X64: # %bb.0: ; X64-NEXT: movl %edi, %eax +; X64-NEXT: xorl %esi, %eax +; X64-NEXT: xorl %esi, %eax ; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_const_true: ; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: xorl %ecx, %eax +; X32-NEXT: xorl %ecx, %eax ; X32-NEXT: retl ; ; X32-NOCMOV-LABEL: test_ctselect_const_true: ; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: xorl %ecx, %eax +; X32-NOCMOV-NEXT: xorl %ecx, %eax ; X32-NOCMOV-NEXT: retl %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b) ret i32 %result @@ -385,13 +342,13 @@ define i32 @test_ctselect_const_false(i32 %a, i32 %b) { ; X32-LABEL: test_ctselect_const_false: ; X32: # %bb.0: ; X32-NEXT: xorl %eax, %eax -; X32-NEXT: orl {{[0-9]+}}(%esp), %eax +; X32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; X32-NEXT: retl ; ; X32-NOCMOV-LABEL: test_ctselect_const_false: ; X32-NOCMOV: # %bb.0: ; X32-NOCMOV-NEXT: xorl %eax, %eax -; X32-NOCMOV-NEXT: orl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: xorl {{[0-9]+}}(%esp), %eax ; X32-NOCMOV-NEXT: retl %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b) ret i32 %result @@ -404,174 +361,79 @@ define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) { ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl %esi, %edi ; X64-NEXT: sete %al -; X64-NEXT: movl %eax, %esi -; X64-NEXT: negl %esi -; X64-NEXT: andl %edx, %esi -; X64-NEXT: decl %eax -; X64-NEXT: andl %ecx, %eax -; X64-NEXT: orl %esi, %eax +; X64-NEXT: xorl %ecx, %edx +; X64-NEXT: negl %eax +; X64-NEXT: andl %edx, %eax +; X64-NEXT: xorl %ecx, %eax ; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_icmp_eq: ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: xorl %eax, %eax -; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: cmpl {{[0-9]+}}(%esp), %edx ; X32-NEXT: sete %al -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: negl %ecx -; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: decl %eax -; X32-NEXT: andl {{[0-9]+}}(%esp), %eax -; X32-NEXT: orl %ecx, %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: xorl %ecx, %edx +; X32-NEXT: negl %eax +; X32-NEXT: andl %edx, %eax +; X32-NEXT: xorl %ecx, %eax ; X32-NEXT: retl ; ; X32-NOCMOV-LABEL: test_ctselect_icmp_eq: ; X32-NOCMOV: # %bb.0: ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NOCMOV-NEXT: xorl %eax, %eax -; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %edx ; X32-NOCMOV-NEXT: sete %al -; X32-NOCMOV-NEXT: movl %eax, %ecx -; X32-NOCMOV-NEXT: negl %ecx -; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X32-NOCMOV-NEXT: decl %eax -; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %eax -; X32-NOCMOV-NEXT: orl %ecx, %eax +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: xorl %ecx, %edx +; X32-NOCMOV-NEXT: negl %eax +; X32-NOCMOV-NEXT: andl %edx, %eax +; X32-NOCMOV-NEXT: xorl %ecx, %eax ; X32-NOCMOV-NEXT: retl %cond = icmp eq i32 %x, %y %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) ret i32 %result } -define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) { -; X64-LABEL: test_ctselect_icmp_ne: -; X64: # %bb.0: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl %esi, %edi -; X64-NEXT: setne %al -; X64-NEXT: movl %eax, %esi -; X64-NEXT: negl %esi -; X64-NEXT: andl %edx, %esi -; X64-NEXT: decl %eax -; X64-NEXT: andl %ecx, %eax -; X64-NEXT: orl %esi, %eax -; X64-NEXT: retq -; -; X32-LABEL: test_ctselect_icmp_ne: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: setne %al -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: negl %ecx -; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: decl %eax -; X32-NEXT: andl {{[0-9]+}}(%esp), %eax -; X32-NEXT: orl %ecx, %eax -; X32-NEXT: retl -; -; X32-NOCMOV-LABEL: test_ctselect_icmp_ne: -; X32-NOCMOV: # %bb.0: -; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NOCMOV-NEXT: xorl %eax, %eax -; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %ecx -; X32-NOCMOV-NEXT: setne %al -; X32-NOCMOV-NEXT: movl %eax, %ecx -; X32-NOCMOV-NEXT: negl %ecx -; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X32-NOCMOV-NEXT: decl %eax -; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %eax -; X32-NOCMOV-NEXT: orl %ecx, %eax -; X32-NOCMOV-NEXT: retl - %cond = icmp ne i32 %x, %y - %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) - ret i32 %result -} - -define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) { -; X64-LABEL: test_ctselect_icmp_slt: -; X64: # %bb.0: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl %esi, %edi -; X64-NEXT: setl %al -; X64-NEXT: movl %eax, %esi -; X64-NEXT: negl %esi -; X64-NEXT: andl %edx, %esi -; X64-NEXT: decl %eax -; X64-NEXT: andl %ecx, %eax -; X64-NEXT: orl %esi, %eax -; X64-NEXT: retq -; -; X32-LABEL: test_ctselect_icmp_slt: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: setl %al -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: negl %ecx -; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: decl %eax -; X32-NEXT: andl {{[0-9]+}}(%esp), %eax -; X32-NEXT: orl %ecx, %eax -; X32-NEXT: retl -; -; X32-NOCMOV-LABEL: test_ctselect_icmp_slt: -; X32-NOCMOV: # %bb.0: -; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NOCMOV-NEXT: xorl %eax, %eax -; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %ecx -; X32-NOCMOV-NEXT: setl %al -; X32-NOCMOV-NEXT: movl %eax, %ecx -; X32-NOCMOV-NEXT: negl %ecx -; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X32-NOCMOV-NEXT: decl %eax -; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %eax -; X32-NOCMOV-NEXT: orl %ecx, %eax -; X32-NOCMOV-NEXT: retl - %cond = icmp slt i32 %x, %y - %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) - ret i32 %result -} - define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) { ; X64-LABEL: test_ctselect_icmp_ult: ; X64: # %bb.0: +; X64-NEXT: xorl %ecx, %edx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl %esi, %edi ; X64-NEXT: sbbl %eax, %eax -; X64-NEXT: andl %eax, %edx -; X64-NEXT: notl %eax -; X64-NEXT: andl %ecx, %eax -; X64-NEXT: orl %edx, %eax +; X64-NEXT: andl %edx, %eax +; X64-NEXT: xorl %ecx, %eax ; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_icmp_ult: ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: sbbl %eax, %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: andl %eax, %ecx -; X32-NEXT: notl %eax -; X32-NEXT: andl {{[0-9]+}}(%esp), %eax -; X32-NEXT: orl %ecx, %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: xorl %edx, %edx +; X32-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X32-NEXT: sbbl %edx, %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: xorl %ecx, %eax +; X32-NEXT: andl %edx, %eax +; X32-NEXT: xorl %ecx, %eax ; X32-NEXT: retl ; ; X32-NOCMOV-LABEL: test_ctselect_icmp_ult: ; X32-NOCMOV: # %bb.0: ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NOCMOV-NEXT: xorl %eax, %eax -; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %ecx -; X32-NOCMOV-NEXT: sbbl %eax, %eax -; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NOCMOV-NEXT: andl %eax, %ecx -; X32-NOCMOV-NEXT: notl %eax -; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %eax -; X32-NOCMOV-NEXT: orl %ecx, %eax +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: xorl %edx, %edx +; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: sbbl %edx, %edx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: xorl %ecx, %eax +; X32-NOCMOV-NEXT: andl %edx, %eax +; X32-NOCMOV-NEXT: xorl %ecx, %eax ; X32-NOCMOV-NEXT: retl %cond = icmp ult i32 %x, %y %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) @@ -583,12 +445,10 @@ define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) { ; X64: # %bb.0: ; X64-NEXT: movd %xmm3, %eax ; X64-NEXT: cmpeqss %xmm1, %xmm0 -; X64-NEXT: movd %xmm0, %ecx -; X64-NEXT: pand %xmm2, %xmm0 -; X64-NEXT: movd %xmm0, %edx -; X64-NEXT: notl %ecx -; X64-NEXT: andl %eax, %ecx -; X64-NEXT: orl %edx, %ecx +; X64-NEXT: pxor %xmm3, %xmm2 +; X64-NEXT: pand %xmm0, %xmm2 +; X64-NEXT: movd %xmm2, %ecx +; X64-NEXT: xorl %eax, %ecx ; X64-NEXT: movd %ecx, %xmm0 ; X64-NEXT: retq ; @@ -596,21 +456,21 @@ define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) { ; X32: # %bb.0: ; X32-NEXT: pushl %eax ; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: flds {{[0-9]+}}(%esp) ; X32-NEXT: flds {{[0-9]+}}(%esp) ; X32-NEXT: fucompi %st(1), %st ; X32-NEXT: fstp %st(0) -; X32-NEXT: setnp %al -; X32-NEXT: sete %cl -; X32-NEXT: andb %al, %cl -; X32-NEXT: movzbl %cl, %eax -; X32-NEXT: movl %eax, %ecx +; X32-NEXT: setnp %cl +; X32-NEXT: sete %dl +; X32-NEXT: andb %cl, %dl +; X32-NEXT: movzbl %dl, %ecx ; X32-NEXT: negl %ecx -; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: decl %eax -; X32-NEXT: andl {{[0-9]+}}(%esp), %eax -; X32-NEXT: orl %ecx, %eax -; X32-NEXT: movl %eax, (%esp) +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: xorl %eax, %edx +; X32-NEXT: andl %ecx, %edx +; X32-NEXT: xorl %eax, %edx +; X32-NEXT: movl %edx, (%esp) ; X32-NEXT: flds (%esp) ; X32-NEXT: popl %eax ; X32-NEXT: .cfi_def_cfa_offset 4 @@ -620,6 +480,7 @@ define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) { ; X32-NOCMOV: # %bb.0: ; X32-NOCMOV-NEXT: pushl %eax ; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) ; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) ; X32-NOCMOV-NEXT: fucompp @@ -627,16 +488,15 @@ define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) { ; X32-NOCMOV-NEXT: # kill: def $ah killed $ah killed $ax ; X32-NOCMOV-NEXT: sahf ; X32-NOCMOV-NEXT: setnp %al -; X32-NOCMOV-NEXT: sete %cl -; X32-NOCMOV-NEXT: andb %al, %cl -; X32-NOCMOV-NEXT: movzbl %cl, %eax -; X32-NOCMOV-NEXT: movl %eax, %ecx -; X32-NOCMOV-NEXT: negl %ecx -; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X32-NOCMOV-NEXT: decl %eax -; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %eax -; X32-NOCMOV-NEXT: orl %ecx, %eax -; X32-NOCMOV-NEXT: movl %eax, (%esp) +; X32-NOCMOV-NEXT: sete %dl +; X32-NOCMOV-NEXT: andb %al, %dl +; X32-NOCMOV-NEXT: movzbl %dl, %eax +; X32-NOCMOV-NEXT: negl %eax +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: xorl %ecx, %edx +; X32-NOCMOV-NEXT: andl %eax, %edx +; X32-NOCMOV-NEXT: xorl %ecx, %edx +; X32-NOCMOV-NEXT: movl %edx, (%esp) ; X32-NOCMOV-NEXT: flds (%esp) ; X32-NOCMOV-NEXT: popl %eax ; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 @@ -650,52 +510,41 @@ define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) { define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) { ; X64-LABEL: test_ctselect_load: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: movl (%rdx), %ecx +; X64-NEXT: movl (%rsi), %eax +; X64-NEXT: xorl %ecx, %eax ; X64-NEXT: andl $1, %edi -; X64-NEXT: leal -1(%rdi), %eax -; X64-NEXT: movl %edi, %ecx -; X64-NEXT: negl %ecx -; X64-NEXT: andl (%rsi), %ecx -; X64-NEXT: andl (%rdx), %eax -; X64-NEXT: orl %ecx, %eax +; X64-NEXT: negl %edi +; X64-NEXT: andl %edi, %eax +; X64-NEXT: xorl %ecx, %eax ; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_load: ; X32: # %bb.0: -; X32-NEXT: pushl %esi -; X32-NEXT: .cfi_def_cfa_offset 8 -; X32-NEXT: .cfi_offset %esi, -8 +; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl (%edx), %edx +; X32-NEXT: movl (%ecx), %ecx +; X32-NEXT: xorl %edx, %ecx ; X32-NEXT: andl $1, %eax -; X32-NEXT: movl %eax, %esi -; X32-NEXT: negl %esi -; X32-NEXT: andl (%edx), %esi -; X32-NEXT: decl %eax -; X32-NEXT: andl (%ecx), %eax -; X32-NEXT: orl %esi, %eax -; X32-NEXT: popl %esi -; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: negl %eax +; X32-NEXT: andl %ecx, %eax +; X32-NEXT: xorl %edx, %eax ; X32-NEXT: retl ; ; X32-NOCMOV-LABEL: test_ctselect_load: ; X32-NOCMOV: # %bb.0: -; X32-NOCMOV-NEXT: pushl %esi -; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 -; X32-NOCMOV-NEXT: .cfi_offset %esi, -8 +; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: movl (%edx), %edx +; X32-NOCMOV-NEXT: movl (%ecx), %ecx +; X32-NOCMOV-NEXT: xorl %edx, %ecx ; X32-NOCMOV-NEXT: andl $1, %eax -; X32-NOCMOV-NEXT: movl %eax, %esi -; X32-NOCMOV-NEXT: negl %esi -; X32-NOCMOV-NEXT: andl (%edx), %esi -; X32-NOCMOV-NEXT: decl %eax -; X32-NOCMOV-NEXT: andl (%ecx), %eax -; X32-NOCMOV-NEXT: orl %esi, %eax -; X32-NOCMOV-NEXT: popl %esi -; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: negl %eax +; X32-NOCMOV-NEXT: andl %ecx, %eax +; X32-NOCMOV-NEXT: xorl %edx, %eax ; X32-NOCMOV-NEXT: retl %a = load i32, ptr %p1 %b = load i32, ptr %p2 @@ -707,69 +556,753 @@ define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) { define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) { ; X64-LABEL: test_ctselect_nested: ; X64: # %bb.0: -; X64-NEXT: # kill: def $esi killed $esi def $rsi -; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: movl %edi, %eax +; X64-NEXT: xorl %ecx, %edx ; X64-NEXT: andl $1, %esi -; X64-NEXT: leal -1(%rsi), %r9d -; X64-NEXT: movl %esi, %eax +; X64-NEXT: negl %esi +; X64-NEXT: andl %edx, %esi +; X64-NEXT: xorl %r8d, %ecx +; X64-NEXT: xorl %esi, %ecx +; X64-NEXT: andl $1, %eax ; X64-NEXT: negl %eax -; X64-NEXT: andl %edx, %eax -; X64-NEXT: andl %ecx, %r9d -; X64-NEXT: orl %eax, %r9d -; X64-NEXT: andl $1, %edi -; X64-NEXT: leal -1(%rdi), %eax -; X64-NEXT: movl %edi, %ecx -; X64-NEXT: negl %ecx -; X64-NEXT: andl %r9d, %ecx -; X64-NEXT: andl %r8d, %eax -; X64-NEXT: orl %ecx, %eax +; X64-NEXT: andl %ecx, %eax +; X64-NEXT: xorl %r8d, %eax ; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_nested: ; X32: # %bb.0: +; X32-NEXT: pushl %edi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %edi, -8 ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: andl $1, %ecx -; X32-NEXT: movl %ecx, %edx -; X32-NEXT: negl %edx -; X32-NEXT: andl {{[0-9]+}}(%esp), %edx -; X32-NEXT: decl %ecx -; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: orl %edx, %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movzbl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: xorl %edx, %edi +; X32-NEXT: andl $1, %esi +; X32-NEXT: negl %esi +; X32-NEXT: andl %edi, %esi +; X32-NEXT: xorl %ecx, %edx +; X32-NEXT: xorl %esi, %edx ; X32-NEXT: andl $1, %eax -; X32-NEXT: movl %eax, %edx -; X32-NEXT: negl %edx -; X32-NEXT: andl %ecx, %edx -; X32-NEXT: decl %eax -; X32-NEXT: andl {{[0-9]+}}(%esp), %eax -; X32-NEXT: orl %edx, %eax +; X32-NEXT: negl %eax +; X32-NEXT: andl %edx, %eax +; X32-NEXT: xorl %ecx, %eax +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %edi +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X32-NOCMOV-LABEL: test_ctselect_nested: ; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %edi, -8 ; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X32-NOCMOV-NEXT: andl $1, %ecx -; X32-NOCMOV-NEXT: movl %ecx, %edx -; X32-NOCMOV-NEXT: negl %edx -; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %edx -; X32-NOCMOV-NEXT: decl %ecx -; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X32-NOCMOV-NEXT: orl %edx, %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %esi +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NOCMOV-NEXT: xorl %edx, %edi +; X32-NOCMOV-NEXT: andl $1, %esi +; X32-NOCMOV-NEXT: negl %esi +; X32-NOCMOV-NEXT: andl %edi, %esi +; X32-NOCMOV-NEXT: xorl %ecx, %edx +; X32-NOCMOV-NEXT: xorl %esi, %edx ; X32-NOCMOV-NEXT: andl $1, %eax -; X32-NOCMOV-NEXT: movl %eax, %edx -; X32-NOCMOV-NEXT: negl %edx -; X32-NOCMOV-NEXT: andl %ecx, %edx -; X32-NOCMOV-NEXT: decl %eax -; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %eax -; X32-NOCMOV-NEXT: orl %edx, %eax +; X32-NOCMOV-NEXT: negl %eax +; X32-NOCMOV-NEXT: andl %edx, %eax +; X32-NOCMOV-NEXT: xorl %ecx, %eax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 ; X32-NOCMOV-NEXT: retl %inner = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b) %result = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %inner, i32 %c) ret i32 %result } +; Test nested CTSELECT pattern with AND merging on i1 values +; Pattern: ctselect C0, (ctselect C1, X, Y), Y -> ctselect (C0 & C1), X, Y +; This optimization only applies when selecting between i1 values (boolean logic) +define i32 @test_ctselect_nested_and_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) { +; X64-LABEL: test_ctselect_nested_and_i1_to_i32: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: andl %esi, %eax +; X64-NEXT: xorl %ecx, %edx +; X64-NEXT: andl $1, %eax +; X64-NEXT: negl %eax +; X64-NEXT: andl %edx, %eax +; X64-NEXT: xorl %ecx, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_nested_and_i1_to_i32: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X32-NEXT: andb {{[0-9]+}}(%esp), %al +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: xorl %ecx, %edx +; X32-NEXT: andl $1, %eax +; X32-NEXT: negl %eax +; X32-NEXT: andl %edx, %eax +; X32-NEXT: xorl %ecx, %eax +; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_nested_and_i1_to_i32: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: andb {{[0-9]+}}(%esp), %al +; X32-NOCMOV-NEXT: movzbl %al, %eax +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: xorl %ecx, %edx +; X32-NOCMOV-NEXT: andl $1, %eax +; X32-NOCMOV-NEXT: negl %eax +; X32-NOCMOV-NEXT: andl %edx, %eax +; X32-NOCMOV-NEXT: xorl %ecx, %eax +; X32-NOCMOV-NEXT: retl + %inner = call i1 @llvm.ct.select.i1(i1 %c1, i1 true, i1 false) + %cond = call i1 @llvm.ct.select.i1(i1 %c0, i1 %inner, i1 false) + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %y) + ret i32 %result +} + +; Test nested CTSELECT pattern with OR merging on i1 values +; Pattern: ctselect C0, X, (ctselect C1, X, Y) -> ctselect (C0 | C1), X, Y +; This optimization only applies when selecting between i1 values (boolean logic) +define i32 @test_ctselect_nested_or_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) { +; X64-LABEL: test_ctselect_nested_or_i1_to_i32: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: orl %esi, %eax +; X64-NEXT: xorl %ecx, %edx +; X64-NEXT: andl $1, %eax +; X64-NEXT: negl %eax +; X64-NEXT: andl %edx, %eax +; X64-NEXT: xorl %ecx, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_nested_or_i1_to_i32: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X32-NEXT: orb {{[0-9]+}}(%esp), %al +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: xorl %ecx, %edx +; X32-NEXT: andl $1, %eax +; X32-NEXT: negl %eax +; X32-NEXT: andl %edx, %eax +; X32-NEXT: xorl %ecx, %eax +; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_nested_or_i1_to_i32: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: orb {{[0-9]+}}(%esp), %al +; X32-NOCMOV-NEXT: movzbl %al, %eax +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: xorl %ecx, %edx +; X32-NOCMOV-NEXT: andl $1, %eax +; X32-NOCMOV-NEXT: negl %eax +; X32-NOCMOV-NEXT: andl %edx, %eax +; X32-NOCMOV-NEXT: xorl %ecx, %eax +; X32-NOCMOV-NEXT: retl + %inner = call i1 @llvm.ct.select.i1(i1 %c1, i1 true, i1 false) + %cond = call i1 @llvm.ct.select.i1(i1 %c0, i1 true, i1 %inner) + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %y) + ret i32 %result +} + +; Test double nested CTSELECT with recursive AND merging +; Pattern: ctselect C0, (ctselect C1, (ctselect C2, X, Y), Y), Y +; -> ctselect C0, (ctselect (C1 & C2), X, Y), Y +; -> ctselect (C0 & (C1 & C2)), X, Y +; This tests that the optimization can be applied recursively +define i32 @test_ctselect_double_nested_and_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x, i32 %y) { +; X64-LABEL: test_ctselect_double_nested_and_i1: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: andl %edx, %eax +; X64-NEXT: andl %edi, %eax +; X64-NEXT: xorl %r8d, %ecx +; X64-NEXT: andl $1, %eax +; X64-NEXT: negl %eax +; X64-NEXT: andl %ecx, %eax +; X64-NEXT: xorl %r8d, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_double_nested_and_i1: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X32-NEXT: andb {{[0-9]+}}(%esp), %al +; X32-NEXT: andb {{[0-9]+}}(%esp), %al +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: xorl %ecx, %edx +; X32-NEXT: andl $1, %eax +; X32-NEXT: negl %eax +; X32-NEXT: andl %edx, %eax +; X32-NEXT: xorl %ecx, %eax +; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_double_nested_and_i1: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: andb {{[0-9]+}}(%esp), %al +; X32-NOCMOV-NEXT: andb {{[0-9]+}}(%esp), %al +; X32-NOCMOV-NEXT: movzbl %al, %eax +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: xorl %ecx, %edx +; X32-NOCMOV-NEXT: andl $1, %eax +; X32-NOCMOV-NEXT: negl %eax +; X32-NOCMOV-NEXT: andl %edx, %eax +; X32-NOCMOV-NEXT: xorl %ecx, %eax +; X32-NOCMOV-NEXT: retl + %inner2 = call i1 @llvm.ct.select.i1(i1 %c2, i1 true, i1 false) + %inner1 = call i1 @llvm.ct.select.i1(i1 %c1, i1 %inner2, i1 false) + %cond = call i1 @llvm.ct.select.i1(i1 %c0, i1 %inner1, i1 false) + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %y) + ret i32 %result +} + +; Vector CTSELECT Tests +; ============================================================================ + +; Test vector CTSELECT with v4i32 (128-bit vector with single i1 mask) +; NOW CONSTANT-TIME: Uses bitwise XOR/AND operations instead of branches! +define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) { +; X64-LABEL: test_ctselect_v4i32: +; X64: # %bb.0: +; X64-NEXT: pxor %xmm1, %xmm0 +; X64-NEXT: movd %edi, %xmm2 +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; X64-NEXT: pslld $31, %xmm2 +; X64-NEXT: psrad $31, %xmm2 +; X64-NEXT: pand %xmm2, %xmm0 +; X64-NEXT: pxor %xmm1, %xmm0 +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_v4i32: +; X32: # %bb.0: +; X32-NEXT: pushl %ebp +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %ebx +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: pushl %edi +; X32-NEXT: .cfi_def_cfa_offset 16 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 20 +; X32-NEXT: .cfi_offset %esi, -20 +; X32-NEXT: .cfi_offset %edi, -16 +; X32-NEXT: .cfi_offset %ebx, -12 +; X32-NEXT: .cfi_offset %ebp, -8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: xorl %ebx, %edx +; X32-NEXT: movzbl {{[0-9]+}}(%esp), %edi +; X32-NEXT: andl $1, %edi +; X32-NEXT: negl %edi +; X32-NEXT: andl %edi, %edx +; X32-NEXT: xorl %ebx, %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: xorl %ebp, %ebx +; X32-NEXT: andl %edi, %ebx +; X32-NEXT: xorl %ebp, %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X32-NEXT: xorl %esi, %ebp +; X32-NEXT: andl %edi, %ebp +; X32-NEXT: xorl %esi, %ebp +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: xorl %ecx, %esi +; X32-NEXT: andl %edi, %esi +; X32-NEXT: xorl %ecx, %esi +; X32-NEXT: movl %esi, 12(%eax) +; X32-NEXT: movl %ebp, 8(%eax) +; X32-NEXT: movl %ebx, 4(%eax) +; X32-NEXT: movl %edx, (%eax) +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 16 +; X32-NEXT: popl %edi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: popl %ebx +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %ebp +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl $4 +; +; X32-NOCMOV-LABEL: test_ctselect_v4i32: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebp +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: pushl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -20 +; X32-NOCMOV-NEXT: .cfi_offset %edi, -16 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebp, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: xorl %ebx, %edx +; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %edi +; X32-NOCMOV-NEXT: andl $1, %edi +; X32-NOCMOV-NEXT: negl %edi +; X32-NOCMOV-NEXT: andl %edi, %edx +; X32-NOCMOV-NEXT: xorl %ebx, %edx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NOCMOV-NEXT: xorl %ebp, %ebx +; X32-NOCMOV-NEXT: andl %edi, %ebx +; X32-NOCMOV-NEXT: xorl %ebp, %ebx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X32-NOCMOV-NEXT: xorl %esi, %ebp +; X32-NOCMOV-NEXT: andl %edi, %ebp +; X32-NOCMOV-NEXT: xorl %esi, %ebp +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NOCMOV-NEXT: xorl %ecx, %esi +; X32-NOCMOV-NEXT: andl %edi, %esi +; X32-NOCMOV-NEXT: xorl %ecx, %esi +; X32-NOCMOV-NEXT: movl %esi, 12(%eax) +; X32-NOCMOV-NEXT: movl %ebp, 8(%eax) +; X32-NOCMOV-NEXT: movl %ebx, 4(%eax) +; X32-NOCMOV-NEXT: movl %edx, (%eax) +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; X32-NOCMOV-NEXT: popl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebp +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl $4 + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} +define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) { +; X64-LABEL: test_ctselect_v4f32: +; X64: # %bb.0: +; X64-NEXT: pxor %xmm1, %xmm0 +; X64-NEXT: movd %edi, %xmm2 +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; X64-NEXT: pslld $31, %xmm2 +; X64-NEXT: psrad $31, %xmm2 +; X64-NEXT: pand %xmm2, %xmm0 +; X64-NEXT: pxor %xmm1, %xmm0 +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_v4f32: +; X32: # %bb.0: +; X32-NEXT: pushl %ebp +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %ebx +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: pushl %edi +; X32-NEXT: .cfi_def_cfa_offset 16 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 20 +; X32-NEXT: .cfi_offset %esi, -20 +; X32-NEXT: .cfi_offset %edi, -16 +; X32-NEXT: .cfi_offset %ebx, -12 +; X32-NEXT: .cfi_offset %ebp, -8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: xorl %ebx, %edx +; X32-NEXT: movzbl {{[0-9]+}}(%esp), %edi +; X32-NEXT: andl $1, %edi +; X32-NEXT: negl %edi +; X32-NEXT: andl %edi, %edx +; X32-NEXT: xorl %ebx, %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: xorl %ebp, %ebx +; X32-NEXT: andl %edi, %ebx +; X32-NEXT: xorl %ebp, %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X32-NEXT: xorl %esi, %ebp +; X32-NEXT: andl %edi, %ebp +; X32-NEXT: xorl %esi, %ebp +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: xorl %ecx, %esi +; X32-NEXT: andl %edi, %esi +; X32-NEXT: xorl %ecx, %esi +; X32-NEXT: movl %esi, 12(%eax) +; X32-NEXT: movl %ebp, 8(%eax) +; X32-NEXT: movl %ebx, 4(%eax) +; X32-NEXT: movl %edx, (%eax) +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 16 +; X32-NEXT: popl %edi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: popl %ebx +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %ebp +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl $4 +; +; X32-NOCMOV-LABEL: test_ctselect_v4f32: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebp +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: pushl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -20 +; X32-NOCMOV-NEXT: .cfi_offset %edi, -16 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebp, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: xorl %ebx, %edx +; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %edi +; X32-NOCMOV-NEXT: andl $1, %edi +; X32-NOCMOV-NEXT: negl %edi +; X32-NOCMOV-NEXT: andl %edi, %edx +; X32-NOCMOV-NEXT: xorl %ebx, %edx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NOCMOV-NEXT: xorl %ebp, %ebx +; X32-NOCMOV-NEXT: andl %edi, %ebx +; X32-NOCMOV-NEXT: xorl %ebp, %ebx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X32-NOCMOV-NEXT: xorl %esi, %ebp +; X32-NOCMOV-NEXT: andl %edi, %ebp +; X32-NOCMOV-NEXT: xorl %esi, %ebp +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NOCMOV-NEXT: xorl %ecx, %esi +; X32-NOCMOV-NEXT: andl %edi, %esi +; X32-NOCMOV-NEXT: xorl %ecx, %esi +; X32-NOCMOV-NEXT: movl %esi, 12(%eax) +; X32-NOCMOV-NEXT: movl %ebp, 8(%eax) +; X32-NOCMOV-NEXT: movl %ebx, 4(%eax) +; X32-NOCMOV-NEXT: movl %edx, (%eax) +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; X32-NOCMOV-NEXT: popl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebp +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl $4 + %result = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) + ret <4 x float> %result +} + +define <8 x i32> @test_ctselect_v8i32_avx(i1 %cond, <8 x i32> %a, <8 x i32> %b) { +; X64-LABEL: test_ctselect_v8i32_avx: +; X64: # %bb.0: +; X64-NEXT: movd %edi, %xmm4 +; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; X64-NEXT: pslld $31, %xmm4 +; X64-NEXT: psrad $31, %xmm4 +; X64-NEXT: movdqa %xmm4, %xmm5 +; X64-NEXT: pandn %xmm2, %xmm5 +; X64-NEXT: pand %xmm4, %xmm0 +; X64-NEXT: por %xmm5, %xmm0 +; X64-NEXT: pand %xmm4, %xmm1 +; X64-NEXT: pandn %xmm3, %xmm4 +; X64-NEXT: por %xmm4, %xmm1 +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_v8i32_avx: +; X32: # %bb.0: +; X32-NEXT: pushl %ebp +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %ebx +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: pushl %edi +; X32-NEXT: .cfi_def_cfa_offset 16 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 20 +; X32-NEXT: subl $8, %esp +; X32-NEXT: .cfi_def_cfa_offset 28 +; X32-NEXT: .cfi_offset %esi, -20 +; X32-NEXT: .cfi_offset %edi, -16 +; X32-NEXT: .cfi_offset %ebx, -12 +; X32-NEXT: .cfi_offset %ebp, -8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: xorl %eax, %ecx +; X32-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X32-NEXT: andl $1, %edx +; X32-NEXT: negl %edx +; X32-NEXT: andl %edx, %ecx +; X32-NEXT: xorl %eax, %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: xorl %esi, %eax +; X32-NEXT: andl %edx, %eax +; X32-NEXT: xorl %esi, %eax +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: xorl %ebx, %esi +; X32-NEXT: andl %edx, %esi +; X32-NEXT: xorl %ebx, %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: xorl %ebp, %ebx +; X32-NEXT: andl %edx, %ebx +; X32-NEXT: xorl %ebp, %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X32-NEXT: xorl %edi, %ebp +; X32-NEXT: andl %edx, %ebp +; X32-NEXT: xorl %edi, %ebp +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: xorl %eax, %edi +; X32-NEXT: andl %edx, %edi +; X32-NEXT: xorl %eax, %edi +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: xorl %eax, %ecx +; X32-NEXT: andl %edx, %ecx +; X32-NEXT: xorl %eax, %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; X32-NEXT: andl %edx, %eax +; X32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movl %eax, 28(%edx) +; X32-NEXT: movl %ecx, 24(%edx) +; X32-NEXT: movl %edi, 20(%edx) +; X32-NEXT: movl %ebp, 16(%edx) +; X32-NEXT: movl %ebx, 12(%edx) +; X32-NEXT: movl %esi, 8(%edx) +; X32-NEXT: movl (%esp), %eax # 4-byte Reload +; X32-NEXT: movl %eax, 4(%edx) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl %eax, (%edx) +; X32-NEXT: movl %edx, %eax +; X32-NEXT: addl $8, %esp +; X32-NEXT: .cfi_def_cfa_offset 20 +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 16 +; X32-NEXT: popl %edi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: popl %ebx +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %ebp +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl $4 +; +; X32-NOCMOV-LABEL: test_ctselect_v8i32_avx: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebp +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: pushl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; X32-NOCMOV-NEXT: subl $8, %esp +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 28 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -20 +; X32-NOCMOV-NEXT: .cfi_offset %edi, -16 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebp, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: xorl %eax, %ecx +; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: andl $1, %edx +; X32-NOCMOV-NEXT: negl %edx +; X32-NOCMOV-NEXT: andl %edx, %ecx +; X32-NOCMOV-NEXT: xorl %eax, %ecx +; X32-NOCMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: xorl %esi, %eax +; X32-NOCMOV-NEXT: andl %edx, %eax +; X32-NOCMOV-NEXT: xorl %esi, %eax +; X32-NOCMOV-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NOCMOV-NEXT: xorl %ebx, %esi +; X32-NOCMOV-NEXT: andl %edx, %esi +; X32-NOCMOV-NEXT: xorl %ebx, %esi +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NOCMOV-NEXT: xorl %ebp, %ebx +; X32-NOCMOV-NEXT: andl %edx, %ebx +; X32-NOCMOV-NEXT: xorl %ebp, %ebx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X32-NOCMOV-NEXT: xorl %edi, %ebp +; X32-NOCMOV-NEXT: andl %edx, %ebp +; X32-NOCMOV-NEXT: xorl %edi, %ebp +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NOCMOV-NEXT: xorl %eax, %edi +; X32-NOCMOV-NEXT: andl %edx, %edi +; X32-NOCMOV-NEXT: xorl %eax, %edi +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: xorl %eax, %ecx +; X32-NOCMOV-NEXT: andl %edx, %ecx +; X32-NOCMOV-NEXT: xorl %eax, %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: xorl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: andl %edx, %eax +; X32-NOCMOV-NEXT: xorl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: movl %eax, 28(%edx) +; X32-NOCMOV-NEXT: movl %ecx, 24(%edx) +; X32-NOCMOV-NEXT: movl %edi, 20(%edx) +; X32-NOCMOV-NEXT: movl %ebp, 16(%edx) +; X32-NOCMOV-NEXT: movl %ebx, 12(%edx) +; X32-NOCMOV-NEXT: movl %esi, 8(%edx) +; X32-NOCMOV-NEXT: movl (%esp), %eax # 4-byte Reload +; X32-NOCMOV-NEXT: movl %eax, 4(%edx) +; X32-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NOCMOV-NEXT: movl %eax, (%edx) +; X32-NOCMOV-NEXT: movl %edx, %eax +; X32-NOCMOV-NEXT: addl $8, %esp +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; X32-NOCMOV-NEXT: popl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebp +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl $4 + %result = call <8 x i32> @llvm.ct.select.v8i32(i1 %cond, <8 x i32> %a, <8 x i32> %b) + ret <8 x i32> %result +} + +define float @test_ctselect_f32_nan_inf(i1 %cond) { +; X64-LABEL: test_ctselect_f32_nan_inf: +; X64: # %bb.0: +; X64-NEXT: andl $1, %edi +; X64-NEXT: negl %edi +; X64-NEXT: andl $4194304, %edi # imm = 0x400000 +; X64-NEXT: xorl $2139095040, %edi # imm = 0x7F800000 +; X64-NEXT: movd %edi, %xmm0 +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_f32_nan_inf: +; X32: # %bb.0: +; X32-NEXT: pushl %eax +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X32-NEXT: andl $1, %eax +; X32-NEXT: negl %eax +; X32-NEXT: andl $4194304, %eax # imm = 0x400000 +; X32-NEXT: xorl $2139095040, %eax # imm = 0x7F800000 +; X32-NEXT: movl %eax, (%esp) +; X32-NEXT: flds (%esp) +; X32-NEXT: popl %eax +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_f32_nan_inf: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %eax +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: andl $1, %eax +; X32-NOCMOV-NEXT: negl %eax +; X32-NOCMOV-NEXT: andl $4194304, %eax # imm = 0x400000 +; X32-NOCMOV-NEXT: xorl $2139095040, %eax # imm = 0x7F800000 +; X32-NOCMOV-NEXT: movl %eax, (%esp) +; X32-NOCMOV-NEXT: flds (%esp) +; X32-NOCMOV-NEXT: popl %eax +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl + %result = call float @llvm.ct.select.f32(i1 %cond, float 0x7FF8000000000000, float 0x7FF0000000000000) + ret float %result +} + +define double @test_ctselect_f64_nan_inf(i1 %cond) { +; X64-LABEL: test_ctselect_f64_nan_inf: +; X64: # %bb.0: +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: andl $1, %edi +; X64-NEXT: negq %rdi +; X64-NEXT: movabsq $2251799813685248, %rax # imm = 0x8000000000000 +; X64-NEXT: andq %rdi, %rax +; X64-NEXT: movabsq $9218868437227405312, %rcx # imm = 0x7FF0000000000000 +; X64-NEXT: xorq %rax, %rcx +; X64-NEXT: movq %rcx, %xmm0 +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_f64_nan_inf: +; X32: # %bb.0: +; X32-NEXT: subl $12, %esp +; X32-NEXT: .cfi_def_cfa_offset 16 +; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X32-NEXT: andl $1, %eax +; X32-NEXT: negl %eax +; X32-NEXT: andl $524288, %eax # imm = 0x80000 +; X32-NEXT: orl $2146435072, %eax # imm = 0x7FF00000 +; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X32-NEXT: movl $0, (%esp) +; X32-NEXT: fldl (%esp) +; X32-NEXT: addl $12, %esp +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_f64_nan_inf: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: subl $12, %esp +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: andl $1, %eax +; X32-NOCMOV-NEXT: negl %eax +; X32-NOCMOV-NEXT: andl $524288, %eax # imm = 0x80000 +; X32-NOCMOV-NEXT: orl $2146435072, %eax # imm = 0x7FF00000 +; X32-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: movl $0, (%esp) +; X32-NOCMOV-NEXT: fldl (%esp) +; X32-NOCMOV-NEXT: addl $12, %esp +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl + %result = call double @llvm.ct.select.f64(i1 %cond, double 0x7FF8000000000000, double 0x7FF0000000000000) + ret double %result +} + ; Declare the intrinsics +declare i1 @llvm.ct.select.i1(i1, i1, i1) declare i8 @llvm.ct.select.i8(i1, i8, i8) declare i16 @llvm.ct.select.i16(i1, i16, i16) declare i32 @llvm.ct.select.i32(i1, i32, i32) @@ -777,3 +1310,12 @@ declare i64 @llvm.ct.select.i64(i1, i64, i64) declare float @llvm.ct.select.f32(i1, float, float) declare double @llvm.ct.select.f64(i1, double, double) declare ptr @llvm.ct.select.p0(i1, ptr, ptr) + +; Vector intrinsics +declare <4 x i32> @llvm.ct.select.v4i32(i1, <4 x i32>, <4 x i32>) +declare <2 x i64> @llvm.ct.select.v2i64(i1, <2 x i64>, <2 x i64>) +declare <8 x i16> @llvm.ct.select.v8i16(i1, <8 x i16>, <8 x i16>) +declare <16 x i8> @llvm.ct.select.v16i8(i1, <16 x i8>, <16 x i8>) +declare <4 x float> @llvm.ct.select.v4f32(i1, <4 x float>, <4 x float>) +declare <2 x double> @llvm.ct.select.v2f64(i1, <2 x double>, <2 x double>) +declare <8 x i32> @llvm.ct.select.v8i32(i1, <8 x i32>, <8 x i32>) _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
