https://github.com/kumarak updated 
https://github.com/llvm/llvm-project/pull/180883

>From e485e21f06a9a0d7650e5bdc093b423b802d39b1 Mon Sep 17 00:00:00 2001
From: kumarak <[email protected]>
Date: Tue, 10 Feb 2026 23:28:50 +0000
Subject: [PATCH] [CodeGen] Preserve constant-time semantics in nested
 CTSELECT DAG combines

- Fix DAG combine to preserve constant-time properties in nested CTSELECT 
patterns
- Improve legalization handling for CTSELECT with AND/OR merging optimizations
- Remove redundant tests from X86/RISCV and add new tests for vector types
- Maintain all critical code path coverage while reducing test maintenance 
burden
---
 llvm/include/llvm/CodeGen/SelectionDAGNodes.h |    2 +-
 llvm/include/llvm/CodeGen/TargetLowering.h    |    2 +-
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  108 +-
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp |   56 +-
 .../SelectionDAG/LegalizeFloatTypes.cpp       |   18 +-
 .../SelectionDAG/LegalizeIntegerTypes.cpp     |    2 +-
 llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h |    2 +
 .../SelectionDAG/LegalizeTypesGeneric.cpp     |   14 +-
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp |    1 -
 llvm/test/CodeGen/RISCV/ctselect-fallback.ll  |  698 +++++++--
 llvm/test/CodeGen/X86/ctselect.ll             | 1268 ++++++++++++-----
 11 files changed, 1571 insertions(+), 600 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h 
b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index fdb76a93bc5bb..aa72e81b2ab54 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -435,7 +435,6 @@ struct SDNodeFlags {
                             NonNeg | NoNaNs | NoInfs | SameSign | InBounds,
     FastMathFlags = NoNaNs | NoInfs | NoSignedZeros | AllowReciprocal |
                     AllowContract | ApproximateFuncs | AllowReassociation,
-
   };
 
   /// Default constructor turns off all optimization flags.
@@ -487,6 +486,7 @@ struct SDNodeFlags {
   bool hasNoFPExcept() const { return Flags & NoFPExcept; }
   bool hasUnpredictable() const { return Flags & Unpredictable; }
   bool hasInBounds() const { return Flags & InBounds; }
+
   bool operator==(const SDNodeFlags &Other) const {
     return Flags == Other.Flags;
   }
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h 
b/llvm/include/llvm/CodeGen/TargetLowering.h
index 084e08e76bd2e..724a69bd26861 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -245,7 +245,7 @@ class LLVM_ABI TargetLoweringBase {
     ScalarValSelect,     // The target supports scalar selects (ex: cmov).
     ScalarCondVectorVal, // The target supports selects with a scalar condition
                          // and vector values (ex: cmov).
-    VectorMaskSelect,    // The target supports vector selects with a vector
+    VectorMaskSelect     // The target supports vector selects with a vector
                          // mask (ex: x86 blends).
   };
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp 
b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 646bc5e78c051..620a79727278b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6050,7 +6050,6 @@ static SDValue isSaturatingMinMax(SDValue N0, SDValue N1, 
SDValue N2,
     N0CC = cast<CondCodeSDNode>(N0.getOperand(4))->get();
     break;
   case ISD::SELECT:
-  case ISD::CTSELECT:
   case ISD::VSELECT:
     if (N0.getOperand(0).getOpcode() != ISD::SETCC)
       return SDValue();
@@ -12219,8 +12218,7 @@ template <class MatchContextClass>
 static SDValue foldBoolSelectToLogic(SDNode *N, const SDLoc &DL,
                                      SelectionDAG &DAG) {
   assert((N->getOpcode() == ISD::SELECT || N->getOpcode() == ISD::VSELECT ||
-          N->getOpcode() == ISD::VP_SELECT ||
-          N->getOpcode() == ISD::CTSELECT) &&
+          N->getOpcode() == ISD::VP_SELECT) &&
          "Expected a (v)(vp.)(ct) select");
   SDValue Cond = N->getOperand(0);
   SDValue T = N->getOperand(1), F = N->getOperand(2);
@@ -12583,6 +12581,12 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
   return SDValue();
 }
 
+// Keep CTSELECT combines deliberately conservative to preserve constant-time
+// intent across generic DAG combines. We only accept:
+//  - canonicalization of negated conditions (flip true/false operands), and
+//  - i1 CTSELECT nesting merges via AND/OR that keep the result as CTSELECT.
+// Broader rewrites should be done in target-specific lowering when stronger
+// guarantees about legality and constant-time preservation are available.
 SDValue DAGCombiner::visitConstantTimeSelect(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -12592,10 +12596,10 @@ SDValue DAGCombiner::visitConstantTimeSelect(SDNode 
*N) {
   SDLoc DL(N);
   SDNodeFlags Flags = N->getFlags();
 
-  if (SDValue V = foldBoolSelectToLogic<EmptyMatchContext>(N, DL, DAG))
-    return V;
-
   // ctselect (not Cond), N1, N2 -> ctselect Cond, N2, N1
+  // This is a CT-safe canonicalization: flip negated condition by swapping
+  // arms. extractBooleanFlip only matches boolean xor-with-1, so this 
preserves
+  // dataflow semantics and does not introduce data-dependent control flow.
   if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false)) {
     SDValue SelectOp = DAG.getNode(ISD::CTSELECT, DL, VT, F, N2, N1);
     SelectOp->setFlags(Flags);
@@ -12603,82 +12607,46 @@ SDValue DAGCombiner::visitConstantTimeSelect(SDNode 
*N) {
   }
 
   if (VT0 == MVT::i1) {
-    // The code in this block deals with the following 2 equivalences:
-    //    select(C0|C1, x, y) <=> select(C0, x, select(C1, x, y))
-    //    select(C0&C1, x, y) <=> select(C0, select(C1, x, y), y)
-    // The target can specify its preferred form with the
-    // shouldNormalizeToSelectSequence() callback. However we always transform
-    // to the right anyway if we find the inner select exists in the DAG anyway
-    // and we always transform to the left side if we know that we can further
-    // optimize the combination of the conditions.
-    bool normalizeToSequence =
-        TLI.shouldNormalizeToSelectSequence(*DAG.getContext(), VT);
-    // ctselect (and Cond0, Cond1), X, Y
-    //   -> ctselect Cond0, (ctselect Cond1, X, Y), Y
-    if (N0->getOpcode() == ISD::AND && N0->hasOneUse()) {
-      SDValue Cond0 = N0->getOperand(0);
-      SDValue Cond1 = N0->getOperand(1);
-      SDValue InnerSelect = DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(),
-                                        Cond1, N1, N2, Flags);
-      if (normalizeToSequence || !InnerSelect.use_empty())
-        return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), Cond0,
-                           InnerSelect, N2, Flags);
-      // Cleanup on failure.
-      if (InnerSelect.use_empty())
-        recursivelyDeleteUnusedNodes(InnerSelect.getNode());
-    }
-    // ctselect (or Cond0, Cond1), X, Y -> ctselect Cond0, X, (ctselect Cond1,
-    // X, Y)
-    if (N0->getOpcode() == ISD::OR && N0->hasOneUse()) {
-      SDValue Cond0 = N0->getOperand(0);
-      SDValue Cond1 = N0->getOperand(1);
-      SDValue InnerSelect = DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(),
-                                        Cond1, N1, N2, Flags);
-      if (normalizeToSequence || !InnerSelect.use_empty())
-        return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), Cond0, N1,
-                           InnerSelect, Flags);
-      // Cleanup on failure.
-      if (InnerSelect.use_empty())
-        recursivelyDeleteUnusedNodes(InnerSelect.getNode());
-    }
-
-    // ctselect Cond0, (ctselect Cond1, X, Y), Y -> ctselect (and Cond0, 
Cond1),
-    // X, Y
+    // Nested CTSELECT merging optimizations for i1 conditions.
+    // These are CT-safe because:
+    //   1. AND/OR are bitwise operations that execute in constant time
+    //   2. The optimization combines two sequential CTSELECTs into one,
+    //   reducing
+    //      the total number of constant-time operations without changing
+    //      semantics
+    //   3. No data-dependent branches or memory accesses are introduced
+    //
+    // Pattern 1: ctselect C0, (ctselect C1, X, Y), Y -> ctselect (C0 & C1), X,
+    // Y
+    //   Semantic equivalence: If C0 is true, evaluate inner select (C1 ? X :
+    //   Y). If C0 is false, choose Y. This is equivalent to (C0 && C1) ? X : 
Y.
     if (N1->getOpcode() == ISD::CTSELECT && N1->hasOneUse()) {
       SDValue N1_0 = N1->getOperand(0);
       SDValue N1_1 = N1->getOperand(1);
       SDValue N1_2 = N1->getOperand(2);
       if (N1_2 == N2 && N0.getValueType() == N1_0.getValueType()) {
-        // Create the actual and node if we can generate good code for it.
-        if (!normalizeToSequence) {
-          SDValue And = DAG.getNode(ISD::AND, DL, N0.getValueType(), N0, N1_0);
-          return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), And, N1_1,
-                             N2, Flags);
-        }
-        // Otherwise see if we can optimize the "and" to a better pattern.
-        if (SDValue Combined = visitANDLike(N0, N1_0, N)) {
-          return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), Combined,
-                             N1_1, N2, Flags);
-        }
+        SDValue And = DAG.getNode(ISD::AND, DL, N0.getValueType(), N0, N1_0);
+        SDValue SelectOp =
+            DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), And, N1_1, N2);
+        SelectOp->setFlags(Flags);
+        return SelectOp;
       }
     }
-    // ctselect Cond0, X, (ctselect Cond1, X, Y) -> ctselect (or Cond0, Cond1),
-    // X, Y
+
+    // Pattern 2: ctselect C0, X, (ctselect C1, X, Y) -> ctselect (C0 | C1), X,
+    // Y
+    //   Semantic equivalence: If C0 is true, choose X. If C0 is false, 
evaluate
+    //   inner select (C1 ? X : Y). This is equivalent to (C0 || C1) ? X : Y.
     if (N2->getOpcode() == ISD::CTSELECT && N2->hasOneUse()) {
       SDValue N2_0 = N2->getOperand(0);
       SDValue N2_1 = N2->getOperand(1);
       SDValue N2_2 = N2->getOperand(2);
       if (N2_1 == N1 && N0.getValueType() == N2_0.getValueType()) {
-        // Create the actual or node if we can generate good code for it.
-        if (!normalizeToSequence) {
-          SDValue Or = DAG.getNode(ISD::OR, DL, N0.getValueType(), N0, N2_0);
-          return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), Or, N1, 
N2_2,
-                             Flags);
-        }
-        // Otherwise see if we can optimize to a better pattern.
-        if (SDValue Combined = visitORLike(N0, N2_0, DL))
-          return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), Combined, 
N1,
-                             N2_2, Flags);
+        SDValue Or = DAG.getNode(ISD::OR, DL, N0.getValueType(), N0, N2_0);
+        SDValue SelectOp =
+            DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), Or, N1, N2_2);
+        SelectOp->setFlags(Flags);
+        return SelectOp;
       }
     }
   }
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp 
b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index ae5a47fa74844..f9c0456651c07 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -4249,19 +4249,51 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     Tmp3 = Node->getOperand(2);
     EVT VT = Tmp2.getValueType();
     if (VT.isVector()) {
-      SmallVector<SDValue> Elements;
-      unsigned NumElements = VT.getVectorNumElements();
-      EVT ScalarVT = VT.getScalarType();
-      for (unsigned Idx = 0; Idx < NumElements; ++Idx) {
-        SDValue IdxVal = DAG.getConstant(Idx, dl, MVT::i64);
-        SDValue TVal =
-            DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Tmp2, IdxVal);
-        SDValue FVal =
-            DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Tmp3, IdxVal);
-        Elements.push_back(
-            DAG.getCTSelect(dl, ScalarVT, Tmp1, TVal, FVal, Node->getFlags()));
+      // Constant-time vector blending using pattern F ^ ((T ^ F) & Mask)
+      // where Mask = broadcast(i1 ? -1 : 0) to match vector element width.
+      //
+      // This formulation uses only XOR and AND operations, avoiding branches
+      // that would leak timing information. It's equivalent to:
+      //   Mask==0xFF: F ^ ((T ^ F) & 0xFF) = F ^ (T ^ F) = T
+      //   Mask==0x00: F ^ ((T ^ F) & 0x00) = F ^ 0 = F
+
+      EVT IntVT = VT;
+      SDValue T = Tmp2; // True value
+      SDValue F = Tmp3; // False value
+
+      // Step 1: Handle floating-point vectors by bitcasting to integer
+      if (VT.isFloatingPoint()) {
+        IntVT = EVT::getVectorVT(
+            *DAG.getContext(),
+            EVT::getIntegerVT(*DAG.getContext(), VT.getScalarSizeInBits()),
+            VT.getVectorElementCount());
+        T = DAG.getNode(ISD::BITCAST, dl, IntVT, T);
+        F = DAG.getNode(ISD::BITCAST, dl, IntVT, F);
       }
-      Tmp1 = DAG.getBuildVector(VT, dl, Elements);
+
+      // Step 2: Broadcast the i1 condition to a vector of i1s
+      // Creates [cond, cond, cond, ...] with i1 elements
+      EVT VecI1Ty = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+                                     VT.getVectorNumElements());
+      SDValue VecCond = DAG.getSplatBuildVector(VecI1Ty, dl, Tmp1);
+
+      // Step 3: Sign-extend i1 vector to get all-bits mask
+      // true (i1=1) -> 0xFFFFFFFF..., false (i1=0) -> 0x00000000
+      // Sign extension is constant-time: pure arithmetic, no branches
+      SDValue Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, IntVT, VecCond);
+
+      // Step 4: Compute constant-time blend: F ^ ((T ^ F) & Mask)
+      // All operations (XOR, AND) execute in constant time
+      SDValue TXorF = DAG.getNode(ISD::XOR, dl, IntVT, T, F);
+      SDValue MaskedDiff = DAG.getNode(ISD::AND, dl, IntVT, TXorF, Mask);
+      Tmp1 = DAG.getNode(ISD::XOR, dl, IntVT, F, MaskedDiff);
+
+      // Step 5: Bitcast back to original floating-point type if needed
+      if (VT.isFloatingPoint()) {
+        Tmp1 = DAG.getNode(ISD::BITCAST, dl, VT, Tmp1);
+      }
+
+      Tmp1->setFlags(Node->getFlags());
     } else if (VT.isFloatingPoint()) {
       EVT IntegerVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
       Tmp2 = DAG.getBitcast(IntegerVT, Tmp2);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp 
b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 6be66663e5c74..1f2d50e8af8d5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -1572,7 +1572,7 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, 
unsigned ResNo) {
   case ISD::POISON:
   case ISD::UNDEF:        SplitRes_UNDEF(N, Lo, Hi); break;
   case ISD::SELECT:       SplitRes_Select(N, Lo, Hi); break;
-  case ISD::CTSELECT:     SplitRes_Select(N, Lo, Hi); break;
+  case ISD::CTSELECT:     SplitRes_CTSELECT(N, Lo, Hi); break;
   case ISD::SELECT_CC:    SplitRes_SELECT_CC(N, Lo, Hi); break;
 
   case ISD::MERGE_VALUES:       ExpandRes_MERGE_VALUES(N, ResNo, Lo, Hi); 
break;
@@ -2930,7 +2930,7 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, 
unsigned ResNo) {
       break;
     case ISD::SELECT:     R = PromoteFloatRes_SELECT(N); break;
     case ISD::CTSELECT:
-      R = PromoteFloatRes_SELECT(N);
+      R = PromoteFloatRes_CTSELECT(N);
       break;
     case ISD::SELECT_CC:  R = PromoteFloatRes_SELECT_CC(N); break;
 
@@ -3238,6 +3238,11 @@ SDValue DAGTypeLegalizer::PromoteFloatRes_SELECT(SDNode 
*N) {
                      N->getOperand(0), TrueVal, FalseVal);
 }
 
+SDValue DAGTypeLegalizer::PromoteFloatRes_CTSELECT(SDNode *N) {
+  // Keep CTSELECT behavior aligned with SELECT promotion logic.
+  return PromoteFloatRes_SELECT(N);
+}
+
 // Construct a new SELECT_CC node with the promoted true- and false- values.
 // The operands used for comparison are promoted by PromoteFloatOp_SELECT_CC.
 SDValue DAGTypeLegalizer::PromoteFloatRes_SELECT_CC(SDNode *N) {
@@ -3419,7 +3424,7 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, 
unsigned ResNo) {
     break;
   case ISD::SELECT:      R = SoftPromoteHalfRes_SELECT(N); break;
   case ISD::CTSELECT:
-    R = SoftPromoteHalfRes_SELECT(N);
+    R = SoftPromoteHalfRes_CTSELECT(N);
     break;
   case ISD::SELECT_CC:   R = SoftPromoteHalfRes_SELECT_CC(N); break;
   case ISD::STRICT_SINT_TO_FP:
@@ -3665,6 +3670,13 @@ SDValue 
DAGTypeLegalizer::SoftPromoteHalfRes_SELECT(SDNode *N) {
                        Op2);
 }
 
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_CTSELECT(SDNode *N) {
+  SDValue Op1 = GetSoftPromotedHalf(N->getOperand(1));
+  SDValue Op2 = GetSoftPromotedHalf(N->getOperand(2));
+  return DAG.getCTSelect(SDLoc(N), Op1.getValueType(), N->getOperand(0), Op1,
+                         Op2);
+}
+
 SDValue DAGTypeLegalizer::SoftPromoteHalfRes_SELECT_CC(SDNode *N) {
   SDValue Op2 = GetSoftPromotedHalf(N->getOperand(2));
   SDValue Op3 = GetSoftPromotedHalf(N->getOperand(3));
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp 
b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index ac20f8a009600..2318ed7834dc0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -3006,7 +3006,7 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, 
unsigned ResNo) {
   case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break;
   case ISD::SELECT:       SplitRes_Select(N, Lo, Hi); break;
   case ISD::CTSELECT:
-    SplitRes_Select(N, Lo, Hi);
+    SplitRes_CTSELECT(N, Lo, Hi);
     break;
   case ISD::SELECT_CC:    SplitRes_SELECT_CC(N, Lo, Hi); break;
   case ISD::POISON:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h 
b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 62069b4fb03a3..33784418db499 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -784,6 +784,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue PromoteFloatRes_LOAD(SDNode *N);
   SDValue PromoteFloatRes_ATOMIC_LOAD(SDNode *N);
   SDValue PromoteFloatRes_SELECT(SDNode *N);
+  SDValue PromoteFloatRes_CTSELECT(SDNode *N);
   SDValue PromoteFloatRes_SELECT_CC(SDNode *N);
   SDValue PromoteFloatRes_UnaryOp(SDNode *N);
   SDValue PromoteFloatRes_AssertNoFPClass(SDNode *N);
@@ -834,6 +835,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue SoftPromoteHalfRes_LOAD(SDNode *N);
   SDValue SoftPromoteHalfRes_ATOMIC_LOAD(SDNode *N);
   SDValue SoftPromoteHalfRes_SELECT(SDNode *N);
+  SDValue SoftPromoteHalfRes_CTSELECT(SDNode *N);
   SDValue SoftPromoteHalfRes_SELECT_CC(SDNode *N);
   SDValue SoftPromoteHalfRes_UnaryOp(SDNode *N);
   SDValue SoftPromoteHalfRes_FABS(SDNode *N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp 
b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
index 098368ef2f6b3..a7187f3ae2bc7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -571,17 +571,9 @@ void DAGTypeLegalizer::SplitRes_Select(SDNode *N, SDValue 
&Lo, SDValue &Hi) {
 }
 
 void DAGTypeLegalizer::SplitRes_CTSELECT(SDNode *N, SDValue &Lo, SDValue &Hi) {
-  SDValue LL, LH, RL, RH, CL, CH;
-  SDLoc dl(N);
-  GetSplitOp(N->getOperand(1), LL, LH);
-  GetSplitOp(N->getOperand(2), RL, RH);
-
-  SDValue Cond = N->getOperand(0);
-  CL = CH = Cond;
-  assert(!Cond.getValueType().isVector() && "Unsupported vector type");
-
-  Lo = DAG.getNode(N->getOpcode(), dl, LL.getValueType(), CL, LL, RL);
-  Hi = DAG.getNode(N->getOpcode(), dl, LH.getValueType(), CH, LH, RH);
+  // Reuse generic select splitting to support scalar and vector conditions.
+  // SplitRes_Select rebuilds with N->getOpcode(), so CTSELECT is preserved.
+  SplitRes_Select(N, Lo, Hi);
 }
 
 void DAGTypeLegalizer::SplitRes_SELECT_CC(SDNode *N, SDValue &Lo,
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp 
b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 6eee61da2e0d1..b7e78b94c0687 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -8179,7 +8179,6 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const 
SDLoc &DL, EVT VT,
       return V;
     break;
   }
-
   case ISD::SELECT:
   case ISD::VSELECT:
     if (SDValue V = simplifySelect(N1, N2, N3))
diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback.ll 
b/llvm/test/CodeGen/RISCV/ctselect-fallback.ll
index f46bde0a05b8b..c624c17d7e33e 100644
--- a/llvm/test/CodeGen/RISCV/ctselect-fallback.ll
+++ b/llvm/test/CodeGen/RISCV/ctselect-fallback.ll
@@ -10,7 +10,7 @@ define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) {
 ; RV64-NEXT:    slli a0, a0, 63
 ; RV64-NEXT:    srai a0, a0, 63
 ; RV64-NEXT:    and a0, a1, a0
-; RV64-NEXT:    xor a0, a0, a2
+; RV64-NEXT:    xor a0, a2, a0
 ; RV64-NEXT:    ret
 ;
 ; RV32-LABEL: test_ctselect_i8:
@@ -19,52 +19,28 @@ define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) {
 ; RV32-NEXT:    slli a0, a0, 31
 ; RV32-NEXT:    srai a0, a0, 31
 ; RV32-NEXT:    and a0, a1, a0
-; RV32-NEXT:    xor a0, a0, a2
+; RV32-NEXT:    xor a0, a2, a0
 ; RV32-NEXT:    ret
   %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b)
   ret i8 %result
 }
-
-define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) {
-; RV64-LABEL: test_ctselect_i16:
+define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) {
+; RV64-LABEL: test_ctselect_i32:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    xor a1, a1, a2
 ; RV64-NEXT:    slli a0, a0, 63
 ; RV64-NEXT:    srai a0, a0, 63
 ; RV64-NEXT:    and a0, a1, a0
-; RV64-NEXT:    xor a0, a0, a2
+; RV64-NEXT:    xor a0, a2, a0
 ; RV64-NEXT:    ret
 ;
-; RV32-LABEL: test_ctselect_i16:
+; RV32-LABEL: test_ctselect_i32:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    xor a1, a1, a2
 ; RV32-NEXT:    slli a0, a0, 31
 ; RV32-NEXT:    srai a0, a0, 31
 ; RV32-NEXT:    and a0, a1, a0
-; RV32-NEXT:    xor a0, a0, a2
-; RV32-NEXT:    ret
-  %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b)
-  ret i16 %result
-}
-
-define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) {
-; RV64-LABEL: test_ctselect_i32:
-; RV64:       # %bb.0:
-; RV64-NEXT:    xor a1, a1, a2
-; RV64-NEXT:    slli a0, a0, 63
-; RV64-NEXT:    srai a0, a0, 63
-; RV64-NEXT:    and a0, a1, a0
-; RV64-NEXT:    xor a0, a0, a2
-; RV64-NEXT:    ret
-;
-; RV32-LABEL: test_ctselect_i32:
-; RV32:       # %bb.0:
-; RV32-NEXT:    andi a0, a0, 1
-; RV32-NEXT:    neg a3, a0
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a1, a3, a1
-; RV32-NEXT:    and a0, a0, a2
-; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    xor a0, a2, a0
 ; RV32-NEXT:    ret
   %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
   ret i32 %result
@@ -73,12 +49,11 @@ define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) {
 define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) {
 ; RV64-LABEL: test_ctselect_i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    andi a0, a0, 1
-; RV64-NEXT:    neg a3, a0
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a1, a3, a1
-; RV64-NEXT:    and a0, a0, a2
-; RV64-NEXT:    or a0, a1, a0
+; RV64-NEXT:    xor a1, a1, a2
+; RV64-NEXT:    slli a0, a0, 63
+; RV64-NEXT:    srai a0, a0, 63
+; RV64-NEXT:    and a0, a1, a0
+; RV64-NEXT:    xor a0, a2, a0
 ; RV64-NEXT:    ret
 ;
 ; RV32-LABEL: test_ctselect_i64:
@@ -89,8 +64,8 @@ define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) {
 ; RV32-NEXT:    srai a0, a0, 31
 ; RV32-NEXT:    and a1, a1, a0
 ; RV32-NEXT:    and a2, a2, a0
-; RV32-NEXT:    xor a0, a1, a3
-; RV32-NEXT:    xor a1, a2, a4
+; RV32-NEXT:    xor a0, a3, a1
+; RV32-NEXT:    xor a1, a4, a2
 ; RV32-NEXT:    ret
   %result = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b)
   ret i64 %result
@@ -99,22 +74,20 @@ define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) {
 define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) {
 ; RV64-LABEL: test_ctselect_ptr:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    andi a0, a0, 1
-; RV64-NEXT:    neg a3, a0
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a1, a3, a1
-; RV64-NEXT:    and a0, a0, a2
-; RV64-NEXT:    or a0, a1, a0
+; RV64-NEXT:    xor a1, a1, a2
+; RV64-NEXT:    slli a0, a0, 63
+; RV64-NEXT:    srai a0, a0, 63
+; RV64-NEXT:    and a0, a1, a0
+; RV64-NEXT:    xor a0, a2, a0
 ; RV64-NEXT:    ret
 ;
 ; RV32-LABEL: test_ctselect_ptr:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    andi a0, a0, 1
-; RV32-NEXT:    neg a3, a0
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a1, a3, a1
-; RV32-NEXT:    and a0, a0, a2
-; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    xor a1, a1, a2
+; RV32-NEXT:    slli a0, a0, 31
+; RV32-NEXT:    srai a0, a0, 31
+; RV32-NEXT:    and a0, a1, a0
+; RV32-NEXT:    xor a0, a2, a0
 ; RV32-NEXT:    ret
   %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b)
   ret ptr %result
@@ -128,6 +101,8 @@ define i32 @test_ctselect_const_true(i32 %a, i32 %b) {
 ;
 ; RV32-LABEL: test_ctselect_const_true:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    xor a0, a0, a1
+; RV32-NEXT:    xor a0, a1, a0
 ; RV32-NEXT:    ret
   %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b)
   ret i32 %result
@@ -158,131 +133,199 @@ define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 
%a, i32 %b) {
 ; RV64-NEXT:    xor a2, a2, a3
 ; RV64-NEXT:    addi a0, a0, -1
 ; RV64-NEXT:    and a0, a2, a0
-; RV64-NEXT:    xor a0, a0, a3
+; RV64-NEXT:    xor a0, a3, a0
 ; RV64-NEXT:    ret
 ;
 ; RV32-LABEL: test_ctselect_icmp_eq:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    xor a0, a0, a1
 ; RV32-NEXT:    snez a0, a0
+; RV32-NEXT:    xor a2, a2, a3
 ; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a2, a0, a2
-; RV32-NEXT:    not a0, a0
-; RV32-NEXT:    and a0, a0, a3
-; RV32-NEXT:    or a0, a2, a0
+; RV32-NEXT:    and a0, a2, a0
+; RV32-NEXT:    xor a0, a3, a0
 ; RV32-NEXT:    ret
   %cond = icmp eq i32 %x, %y
   %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
   ret i32 %result
 }
-
-define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) {
-; RV64-LABEL: test_ctselect_icmp_ne:
+define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) {
+; RV64-LABEL: test_ctselect_icmp_ult:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    sext.w a1, a1
 ; RV64-NEXT:    sext.w a0, a0
-; RV64-NEXT:    xor a0, a0, a1
-; RV64-NEXT:    seqz a0, a0
+; RV64-NEXT:    sltu a0, a0, a1
 ; RV64-NEXT:    xor a2, a2, a3
-; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    neg a0, a0
 ; RV64-NEXT:    and a0, a2, a0
-; RV64-NEXT:    xor a0, a0, a3
+; RV64-NEXT:    xor a0, a3, a0
 ; RV64-NEXT:    ret
 ;
-; RV32-LABEL: test_ctselect_icmp_ne:
+; RV32-LABEL: test_ctselect_icmp_ult:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    xor a0, a0, a1
-; RV32-NEXT:    seqz a0, a0
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a2, a0, a2
-; RV32-NEXT:    not a0, a0
-; RV32-NEXT:    and a0, a0, a3
-; RV32-NEXT:    or a0, a2, a0
+; RV32-NEXT:    sltu a0, a0, a1
+; RV32-NEXT:    xor a2, a2, a3
+; RV32-NEXT:    neg a0, a0
+; RV32-NEXT:    and a0, a2, a0
+; RV32-NEXT:    xor a0, a3, a0
 ; RV32-NEXT:    ret
-  %cond = icmp ne i32 %x, %y
+  %cond = icmp ult i32 %x, %y
   %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
   ret i32 %result
 }
 
-define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) {
-; RV64-LABEL: test_ctselect_icmp_slt:
+; Test with memory operands
+define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) {
+; RV64-LABEL: test_ctselect_load:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    sext.w a1, a1
-; RV64-NEXT:    sext.w a0, a0
-; RV64-NEXT:    slt a0, a0, a1
+; RV64-NEXT:    lw a1, 0(a1)
+; RV64-NEXT:    lw a2, 0(a2)
+; RV64-NEXT:    slli a0, a0, 63
+; RV64-NEXT:    xor a1, a1, a2
+; RV64-NEXT:    srai a0, a0, 63
+; RV64-NEXT:    and a0, a1, a0
+; RV64-NEXT:    xor a0, a2, a0
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_ctselect_load:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    lw a2, 0(a2)
+; RV32-NEXT:    slli a0, a0, 31
+; RV32-NEXT:    xor a1, a1, a2
+; RV32-NEXT:    srai a0, a0, 31
+; RV32-NEXT:    and a0, a1, a0
+; RV32-NEXT:    xor a0, a2, a0
+; RV32-NEXT:    ret
+  %a = load i32, ptr %p1
+  %b = load i32, ptr %p2
+  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+  ret i32 %result
+}
+
+; Test nested CTSELECT pattern with AND merging on i1 values
+; Pattern: ctselect C0, (ctselect C1, X, Y), Y -> ctselect (C0 & C1), X, Y
+define i32 @test_ctselect_nested_and_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) 
{
+; RV64-LABEL: test_ctselect_nested_and_i1_to_i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    and a0, a1, a0
 ; RV64-NEXT:    xor a2, a2, a3
-; RV64-NEXT:    neg a0, a0
+; RV64-NEXT:    slli a0, a0, 63
+; RV64-NEXT:    srai a0, a0, 63
 ; RV64-NEXT:    and a0, a2, a0
-; RV64-NEXT:    xor a0, a0, a3
+; RV64-NEXT:    xor a0, a3, a0
 ; RV64-NEXT:    ret
 ;
-; RV32-LABEL: test_ctselect_icmp_slt:
+; RV32-LABEL: test_ctselect_nested_and_i1_to_i32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    slt a0, a0, a1
-; RV32-NEXT:    neg a1, a0
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a1, a1, a2
-; RV32-NEXT:    and a0, a0, a3
-; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    and a0, a1, a0
+; RV32-NEXT:    xor a2, a2, a3
+; RV32-NEXT:    slli a0, a0, 31
+; RV32-NEXT:    srai a0, a0, 31
+; RV32-NEXT:    and a0, a2, a0
+; RV32-NEXT:    xor a0, a3, a0
 ; RV32-NEXT:    ret
-  %cond = icmp slt i32 %x, %y
-  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+  %inner = call i1 @llvm.ct.select.i1(i1 %c1, i1 true, i1 false)
+  %cond = call i1 @llvm.ct.select.i1(i1 %c0, i1 %inner, i1 false)
+  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %y)
   ret i32 %result
 }
 
-define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) {
-; RV64-LABEL: test_ctselect_icmp_ult:
+; Test nested CTSELECT pattern with OR merging on i1 values
+; Pattern: ctselect C0, X, (ctselect C1, X, Y) -> ctselect (C0 | C1), X, Y
+define i32 @test_ctselect_nested_or_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
+; RV64-LABEL: test_ctselect_nested_or_i1_to_i32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    sext.w a1, a1
-; RV64-NEXT:    sext.w a0, a0
-; RV64-NEXT:    sltu a0, a0, a1
+; RV64-NEXT:    or a0, a0, a1
 ; RV64-NEXT:    xor a2, a2, a3
-; RV64-NEXT:    neg a0, a0
+; RV64-NEXT:    slli a0, a0, 63
+; RV64-NEXT:    srai a0, a0, 63
 ; RV64-NEXT:    and a0, a2, a0
-; RV64-NEXT:    xor a0, a0, a3
+; RV64-NEXT:    xor a0, a3, a0
 ; RV64-NEXT:    ret
 ;
-; RV32-LABEL: test_ctselect_icmp_ult:
+; RV32-LABEL: test_ctselect_nested_or_i1_to_i32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    sltu a0, a0, a1
-; RV32-NEXT:    neg a1, a0
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a1, a1, a2
-; RV32-NEXT:    and a0, a0, a3
-; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    or a0, a0, a1
+; RV32-NEXT:    xor a2, a2, a3
+; RV32-NEXT:    slli a0, a0, 31
+; RV32-NEXT:    srai a0, a0, 31
+; RV32-NEXT:    and a0, a2, a0
+; RV32-NEXT:    xor a0, a3, a0
 ; RV32-NEXT:    ret
-  %cond = icmp ult i32 %x, %y
-  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+  %inner = call i1 @llvm.ct.select.i1(i1 %c1, i1 true, i1 false)
+  %cond = call i1 @llvm.ct.select.i1(i1 %c0, i1 true, i1 %inner)
+  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %y)
   ret i32 %result
 }
 
-; Test with memory operands
-define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) {
-; RV64-LABEL: test_ctselect_load:
+; Test double nested CTSELECT with recursive AND merging
+; Pattern: ctselect C0, (ctselect C1, (ctselect C2, X, Y), Y), Y
+;   -> ctselect (C0 & C1 & C2), X, Y
+define i32 @test_ctselect_double_nested_and_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x, 
i32 %y) {
+; RV64-LABEL: test_ctselect_double_nested_and_i1:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lw a1, 0(a1)
-; RV64-NEXT:    lw a2, 0(a2)
+; RV64-NEXT:    and a1, a2, a1
+; RV64-NEXT:    and a0, a1, a0
+; RV64-NEXT:    xor a3, a3, a4
 ; RV64-NEXT:    slli a0, a0, 63
-; RV64-NEXT:    xor a1, a1, a2
 ; RV64-NEXT:    srai a0, a0, 63
+; RV64-NEXT:    and a0, a3, a0
+; RV64-NEXT:    xor a0, a4, a0
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_ctselect_double_nested_and_i1:
+; RV32:       # %bb.0:
+; RV32-NEXT:    and a1, a2, a1
+; RV32-NEXT:    and a0, a1, a0
+; RV32-NEXT:    xor a3, a3, a4
+; RV32-NEXT:    slli a0, a0, 31
+; RV32-NEXT:    srai a0, a0, 31
+; RV32-NEXT:    and a0, a3, a0
+; RV32-NEXT:    xor a0, a4, a0
+; RV32-NEXT:    ret
+  %inner2 = call i1 @llvm.ct.select.i1(i1 %c2, i1 true, i1 false)
+  %inner1 = call i1 @llvm.ct.select.i1(i1 %c1, i1 %inner2, i1 false)
+  %cond = call i1 @llvm.ct.select.i1(i1 %c0, i1 %inner1, i1 false)
+  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %y)
+  ret i32 %result
+}
+
+; Test double nested CTSELECT with mixed AND/OR patterns
+define i32 @test_ctselect_double_nested_mixed_i1(i1 %c0, i1 %c1, i1 %c2, i32 
%x, i32 %y, i32 %z) {
+; RV64-LABEL: test_ctselect_double_nested_mixed_i1:
+; RV64:       # %bb.0:
 ; RV64-NEXT:    and a0, a1, a0
-; RV64-NEXT:    xor a0, a0, a2
+; RV64-NEXT:    xor a3, a3, a4
+; RV64-NEXT:    or a0, a0, a2
+; RV64-NEXT:    slli a0, a0, 63
+; RV64-NEXT:    srai a0, a0, 63
+; RV64-NEXT:    and a3, a3, a0
+; RV64-NEXT:    xor a4, a4, a5
+; RV64-NEXT:    xor a3, a4, a3
+; RV64-NEXT:    and a0, a3, a0
+; RV64-NEXT:    xor a0, a5, a0
 ; RV64-NEXT:    ret
 ;
-; RV32-LABEL: test_ctselect_load:
+; RV32-LABEL: test_ctselect_double_nested_mixed_i1:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lw a1, 0(a1)
-; RV32-NEXT:    lw a2, 0(a2)
-; RV32-NEXT:    andi a0, a0, 1
-; RV32-NEXT:    neg a3, a0
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a1, a3, a1
-; RV32-NEXT:    and a0, a0, a2
-; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    and a0, a1, a0
+; RV32-NEXT:    xor a3, a3, a4
+; RV32-NEXT:    or a0, a0, a2
+; RV32-NEXT:    slli a0, a0, 31
+; RV32-NEXT:    srai a0, a0, 31
+; RV32-NEXT:    and a3, a3, a0
+; RV32-NEXT:    xor a4, a4, a5
+; RV32-NEXT:    xor a3, a4, a3
+; RV32-NEXT:    and a0, a3, a0
+; RV32-NEXT:    xor a0, a5, a0
 ; RV32-NEXT:    ret
-  %a = load i32, ptr %p1
-  %b = load i32, ptr %p2
-  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+  %inner1 = call i1 @llvm.ct.select.i1(i1 %c1, i1 true, i1 false)
+  %and_cond = call i1 @llvm.ct.select.i1(i1 %c0, i1 %inner1, i1 false)
+  %inner2 = call i1 @llvm.ct.select.i1(i1 %c2, i1 true, i1 false)
+  %or_cond = call i1 @llvm.ct.select.i1(i1 %and_cond, i1 true, i1 %inner2)
+  %inner_result = call i32 @llvm.ct.select.i32(i1 %or_cond, i32 %x, i32 %y)
+  %result = call i32 @llvm.ct.select.i32(i1 %or_cond, i32 %inner_result, i32 
%z)
   ret i32 %result
 }
 
@@ -296,35 +339,416 @@ define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, 
i32 %a, i32 %b, i32 %c) {
 ; RV64-NEXT:    slli a0, a0, 63
 ; RV64-NEXT:    srai a1, a1, 63
 ; RV64-NEXT:    and a1, a2, a1
-; RV64-NEXT:    xor a1, a1, a3
+; RV64-NEXT:    xor a1, a3, a1
 ; RV64-NEXT:    srai a0, a0, 63
 ; RV64-NEXT:    and a0, a1, a0
-; RV64-NEXT:    xor a0, a0, a4
+; RV64-NEXT:    xor a0, a4, a0
 ; RV64-NEXT:    ret
 ;
 ; RV32-LABEL: test_ctselect_nested:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    andi a1, a1, 1
-; RV32-NEXT:    andi a0, a0, 1
-; RV32-NEXT:    neg a5, a1
-; RV32-NEXT:    addi a1, a1, -1
-; RV32-NEXT:    and a2, a5, a2
-; RV32-NEXT:    neg a5, a0
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a1, a1, a3
-; RV32-NEXT:    or a1, a2, a1
-; RV32-NEXT:    and a1, a5, a1
-; RV32-NEXT:    and a0, a0, a4
-; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    xor a2, a2, a3
+; RV32-NEXT:    slli a1, a1, 31
+; RV32-NEXT:    xor a3, a3, a4
+; RV32-NEXT:    slli a0, a0, 31
+; RV32-NEXT:    srai a1, a1, 31
+; RV32-NEXT:    and a1, a2, a1
+; RV32-NEXT:    xor a1, a3, a1
+; RV32-NEXT:    srai a0, a0, 31
+; RV32-NEXT:    and a0, a1, a0
+; RV32-NEXT:    xor a0, a4, a0
 ; RV32-NEXT:    ret
   %inner = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b)
   %result = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %inner, i32 %c)
   ret i32 %result
 }
 
+; Test floating-point ct.select selecting between NaN and Inf
+define float @test_ctselect_f32_nan_inf(i1 %cond) {
+; RV64-LABEL: test_ctselect_f32_nan_inf:
+; RV64:       # %bb.0:
+; RV64-NEXT:    slli a0, a0, 63
+; RV64-NEXT:    lui a1, 1024
+; RV64-NEXT:    srai a0, a0, 63
+; RV64-NEXT:    and a0, a0, a1
+; RV64-NEXT:    lui a1, 522240
+; RV64-NEXT:    or a0, a0, a1
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_ctselect_f32_nan_inf:
+; RV32:       # %bb.0:
+; RV32-NEXT:    slli a0, a0, 31
+; RV32-NEXT:    lui a1, 1024
+; RV32-NEXT:    srai a0, a0, 31
+; RV32-NEXT:    and a0, a0, a1
+; RV32-NEXT:    lui a1, 522240
+; RV32-NEXT:    xor a0, a0, a1
+; RV32-NEXT:    ret
+  %result = call float @llvm.ct.select.f32(i1 %cond, float 0x7FF8000000000000, 
float 0x7FF0000000000000)
+  ret float %result
+}
+
+define double @test_ctselect_f64_nan_inf(i1 %cond) {
+; RV64-LABEL: test_ctselect_f64_nan_inf:
+; RV64:       # %bb.0:
+; RV64-NEXT:    slli a0, a0, 63
+; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    srai a0, a0, 63
+; RV64-NEXT:    slli a1, a1, 51
+; RV64-NEXT:    and a0, a0, a1
+; RV64-NEXT:    li a1, 2047
+; RV64-NEXT:    slli a1, a1, 52
+; RV64-NEXT:    xor a0, a0, a1
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_ctselect_f64_nan_inf:
+; RV32:       # %bb.0:
+; RV32-NEXT:    slli a0, a0, 31
+; RV32-NEXT:    lui a1, 128
+; RV32-NEXT:    srai a0, a0, 31
+; RV32-NEXT:    and a0, a0, a1
+; RV32-NEXT:    lui a1, 524032
+; RV32-NEXT:    or a1, a0, a1
+; RV32-NEXT:    li a0, 0
+; RV32-NEXT:    ret
+  %result = call double @llvm.ct.select.f64(i1 %cond, double 
0x7FF8000000000000, double 0x7FF0000000000000)
+  ret double %result
+}
+
+; Test basic floating-point ct.select
+define float @test_ctselect_f32(i1 %cond, float %a, float %b) {
+; RV64-LABEL: test_ctselect_f32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    xor a1, a1, a2
+; RV64-NEXT:    slli a0, a0, 63
+; RV64-NEXT:    srai a0, a0, 63
+; RV64-NEXT:    and a0, a1, a0
+; RV64-NEXT:    xor a0, a2, a0
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_ctselect_f32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    xor a1, a1, a2
+; RV32-NEXT:    slli a0, a0, 31
+; RV32-NEXT:    srai a0, a0, 31
+; RV32-NEXT:    and a0, a1, a0
+; RV32-NEXT:    xor a0, a2, a0
+; RV32-NEXT:    ret
+  %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b)
+  ret float %result
+}
+
+define double @test_ctselect_f64(i1 %cond, double %a, double %b) {
+; RV64-LABEL: test_ctselect_f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    xor a1, a1, a2
+; RV64-NEXT:    slli a0, a0, 63
+; RV64-NEXT:    srai a0, a0, 63
+; RV64-NEXT:    and a0, a1, a0
+; RV64-NEXT:    xor a0, a2, a0
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_ctselect_f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    xor a1, a1, a3
+; RV32-NEXT:    slli a0, a0, 31
+; RV32-NEXT:    xor a2, a2, a4
+; RV32-NEXT:    srai a0, a0, 31
+; RV32-NEXT:    and a1, a1, a0
+; RV32-NEXT:    and a2, a2, a0
+; RV32-NEXT:    xor a0, a3, a1
+; RV32-NEXT:    xor a1, a4, a2
+; RV32-NEXT:    ret
+  %result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b)
+  ret double %result
+}
+
+; Test vector ct.select with integer vectors
+define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) {
+; RV64-LABEL: test_ctselect_v4i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    lw a4, 0(a3)
+; RV64-NEXT:    lw a5, 8(a3)
+; RV64-NEXT:    lw a6, 16(a3)
+; RV64-NEXT:    lw a3, 24(a3)
+; RV64-NEXT:    lw a7, 0(a2)
+; RV64-NEXT:    lw t0, 8(a2)
+; RV64-NEXT:    lw t1, 16(a2)
+; RV64-NEXT:    lw a2, 24(a2)
+; RV64-NEXT:    slli a1, a1, 63
+; RV64-NEXT:    srai a1, a1, 63
+; RV64-NEXT:    xor a7, a7, a4
+; RV64-NEXT:    xor t0, t0, a5
+; RV64-NEXT:    xor t1, t1, a6
+; RV64-NEXT:    xor a2, a2, a3
+; RV64-NEXT:    and a7, a7, a1
+; RV64-NEXT:    and t0, t0, a1
+; RV64-NEXT:    and t1, t1, a1
+; RV64-NEXT:    and a1, a2, a1
+; RV64-NEXT:    xor a2, a4, a7
+; RV64-NEXT:    xor a4, a5, t0
+; RV64-NEXT:    xor a5, a6, t1
+; RV64-NEXT:    xor a1, a3, a1
+; RV64-NEXT:    sw a2, 0(a0)
+; RV64-NEXT:    sw a4, 4(a0)
+; RV64-NEXT:    sw a5, 8(a0)
+; RV64-NEXT:    sw a1, 12(a0)
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_ctselect_v4i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a4, 0(a3)
+; RV32-NEXT:    lw a5, 4(a3)
+; RV32-NEXT:    lw a6, 8(a3)
+; RV32-NEXT:    lw a3, 12(a3)
+; RV32-NEXT:    lw a7, 0(a2)
+; RV32-NEXT:    lw t0, 4(a2)
+; RV32-NEXT:    lw t1, 8(a2)
+; RV32-NEXT:    lw a2, 12(a2)
+; RV32-NEXT:    slli a1, a1, 31
+; RV32-NEXT:    srai a1, a1, 31
+; RV32-NEXT:    xor a7, a7, a4
+; RV32-NEXT:    xor t0, t0, a5
+; RV32-NEXT:    xor t1, t1, a6
+; RV32-NEXT:    xor a2, a2, a3
+; RV32-NEXT:    and a7, a7, a1
+; RV32-NEXT:    and t0, t0, a1
+; RV32-NEXT:    and t1, t1, a1
+; RV32-NEXT:    and a1, a2, a1
+; RV32-NEXT:    xor a2, a4, a7
+; RV32-NEXT:    xor a4, a5, t0
+; RV32-NEXT:    xor a5, a6, t1
+; RV32-NEXT:    xor a1, a3, a1
+; RV32-NEXT:    sw a2, 0(a0)
+; RV32-NEXT:    sw a4, 4(a0)
+; RV32-NEXT:    sw a5, 8(a0)
+; RV32-NEXT:    sw a1, 12(a0)
+; RV32-NEXT:    ret
+  %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x 
i32> %b)
+  ret <4 x i32> %result
+}
+define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> 
%b) {
+; RV64-LABEL: test_ctselect_v4f32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    lw a4, 0(a3)
+; RV64-NEXT:    lw a5, 8(a3)
+; RV64-NEXT:    lw a6, 16(a3)
+; RV64-NEXT:    lw a3, 24(a3)
+; RV64-NEXT:    lw a7, 0(a2)
+; RV64-NEXT:    lw t0, 8(a2)
+; RV64-NEXT:    lw t1, 16(a2)
+; RV64-NEXT:    lw a2, 24(a2)
+; RV64-NEXT:    slli a1, a1, 63
+; RV64-NEXT:    srai a1, a1, 63
+; RV64-NEXT:    xor a7, a7, a4
+; RV64-NEXT:    xor t0, t0, a5
+; RV64-NEXT:    xor t1, t1, a6
+; RV64-NEXT:    xor a2, a2, a3
+; RV64-NEXT:    and a7, a7, a1
+; RV64-NEXT:    and t0, t0, a1
+; RV64-NEXT:    and t1, t1, a1
+; RV64-NEXT:    and a1, a2, a1
+; RV64-NEXT:    xor a2, a4, a7
+; RV64-NEXT:    xor a4, a5, t0
+; RV64-NEXT:    xor a5, a6, t1
+; RV64-NEXT:    xor a1, a3, a1
+; RV64-NEXT:    sw a2, 0(a0)
+; RV64-NEXT:    sw a4, 4(a0)
+; RV64-NEXT:    sw a5, 8(a0)
+; RV64-NEXT:    sw a1, 12(a0)
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_ctselect_v4f32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a4, 0(a3)
+; RV32-NEXT:    lw a5, 4(a3)
+; RV32-NEXT:    lw a6, 8(a3)
+; RV32-NEXT:    lw a3, 12(a3)
+; RV32-NEXT:    lw a7, 0(a2)
+; RV32-NEXT:    lw t0, 4(a2)
+; RV32-NEXT:    lw t1, 8(a2)
+; RV32-NEXT:    lw a2, 12(a2)
+; RV32-NEXT:    slli a1, a1, 31
+; RV32-NEXT:    srai a1, a1, 31
+; RV32-NEXT:    xor a7, a7, a4
+; RV32-NEXT:    xor t0, t0, a5
+; RV32-NEXT:    xor t1, t1, a6
+; RV32-NEXT:    xor a2, a2, a3
+; RV32-NEXT:    and a7, a7, a1
+; RV32-NEXT:    and t0, t0, a1
+; RV32-NEXT:    and t1, t1, a1
+; RV32-NEXT:    and a1, a2, a1
+; RV32-NEXT:    xor a2, a4, a7
+; RV32-NEXT:    xor a4, a5, t0
+; RV32-NEXT:    xor a5, a6, t1
+; RV32-NEXT:    xor a1, a3, a1
+; RV32-NEXT:    sw a2, 0(a0)
+; RV32-NEXT:    sw a4, 4(a0)
+; RV32-NEXT:    sw a5, 8(a0)
+; RV32-NEXT:    sw a1, 12(a0)
+; RV32-NEXT:    ret
+  %result = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %a, 
<4 x float> %b)
+  ret <4 x float> %result
+}
+define <8 x i32> @test_ctselect_v8i32(i1 %cond, <8 x i32> %a, <8 x i32> %b) {
+; RV64-LABEL: test_ctselect_v8i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -32
+; RV64-NEXT:    .cfi_def_cfa_offset 32
+; RV64-NEXT:    sd s0, 24(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s1, 16(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s2, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT:    .cfi_offset s0, -8
+; RV64-NEXT:    .cfi_offset s1, -16
+; RV64-NEXT:    .cfi_offset s2, -24
+; RV64-NEXT:    lw a7, 32(a3)
+; RV64-NEXT:    lw a6, 40(a3)
+; RV64-NEXT:    lw a5, 48(a3)
+; RV64-NEXT:    lw a4, 56(a3)
+; RV64-NEXT:    lw t0, 32(a2)
+; RV64-NEXT:    lw t1, 40(a2)
+; RV64-NEXT:    lw t2, 48(a2)
+; RV64-NEXT:    lw t3, 56(a2)
+; RV64-NEXT:    lw t4, 0(a3)
+; RV64-NEXT:    lw t5, 8(a3)
+; RV64-NEXT:    lw t6, 16(a3)
+; RV64-NEXT:    lw a3, 24(a3)
+; RV64-NEXT:    lw s0, 0(a2)
+; RV64-NEXT:    lw s1, 8(a2)
+; RV64-NEXT:    lw s2, 16(a2)
+; RV64-NEXT:    lw a2, 24(a2)
+; RV64-NEXT:    slli a1, a1, 63
+; RV64-NEXT:    srai a1, a1, 63
+; RV64-NEXT:    xor s0, s0, t4
+; RV64-NEXT:    xor s1, s1, t5
+; RV64-NEXT:    xor s2, s2, t6
+; RV64-NEXT:    xor a2, a2, a3
+; RV64-NEXT:    xor t0, t0, a7
+; RV64-NEXT:    xor t1, t1, a6
+; RV64-NEXT:    xor t2, t2, a5
+; RV64-NEXT:    xor t3, t3, a4
+; RV64-NEXT:    and s0, s0, a1
+; RV64-NEXT:    and s1, s1, a1
+; RV64-NEXT:    and s2, s2, a1
+; RV64-NEXT:    and a2, a2, a1
+; RV64-NEXT:    and t0, t0, a1
+; RV64-NEXT:    and t1, t1, a1
+; RV64-NEXT:    and t2, t2, a1
+; RV64-NEXT:    and a1, t3, a1
+; RV64-NEXT:    xor t3, t4, s0
+; RV64-NEXT:    xor t4, t5, s1
+; RV64-NEXT:    xor t5, t6, s2
+; RV64-NEXT:    xor a2, a3, a2
+; RV64-NEXT:    xor a3, a7, t0
+; RV64-NEXT:    xor a6, a6, t1
+; RV64-NEXT:    xor a5, a5, t2
+; RV64-NEXT:    xor a1, a4, a1
+; RV64-NEXT:    sw a3, 16(a0)
+; RV64-NEXT:    sw a6, 20(a0)
+; RV64-NEXT:    sw a5, 24(a0)
+; RV64-NEXT:    sw a1, 28(a0)
+; RV64-NEXT:    sw t3, 0(a0)
+; RV64-NEXT:    sw t4, 4(a0)
+; RV64-NEXT:    sw t5, 8(a0)
+; RV64-NEXT:    sw a2, 12(a0)
+; RV64-NEXT:    ld s0, 24(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s1, 16(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s2, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT:    .cfi_restore s0
+; RV64-NEXT:    .cfi_restore s1
+; RV64-NEXT:    .cfi_restore s2
+; RV64-NEXT:    addi sp, sp, 32
+; RV64-NEXT:    .cfi_def_cfa_offset 0
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_ctselect_v8i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s2, 4(sp) # 4-byte Folded Spill
+; RV32-NEXT:    .cfi_offset s0, -4
+; RV32-NEXT:    .cfi_offset s1, -8
+; RV32-NEXT:    .cfi_offset s2, -12
+; RV32-NEXT:    lw a7, 16(a3)
+; RV32-NEXT:    lw a6, 20(a3)
+; RV32-NEXT:    lw a5, 24(a3)
+; RV32-NEXT:    lw a4, 28(a3)
+; RV32-NEXT:    lw t0, 16(a2)
+; RV32-NEXT:    lw t1, 20(a2)
+; RV32-NEXT:    lw t2, 24(a2)
+; RV32-NEXT:    lw t3, 28(a2)
+; RV32-NEXT:    lw t4, 0(a3)
+; RV32-NEXT:    lw t5, 4(a3)
+; RV32-NEXT:    lw t6, 8(a3)
+; RV32-NEXT:    lw a3, 12(a3)
+; RV32-NEXT:    lw s0, 0(a2)
+; RV32-NEXT:    lw s1, 4(a2)
+; RV32-NEXT:    lw s2, 8(a2)
+; RV32-NEXT:    lw a2, 12(a2)
+; RV32-NEXT:    slli a1, a1, 31
+; RV32-NEXT:    srai a1, a1, 31
+; RV32-NEXT:    xor s0, s0, t4
+; RV32-NEXT:    xor s1, s1, t5
+; RV32-NEXT:    xor s2, s2, t6
+; RV32-NEXT:    xor a2, a2, a3
+; RV32-NEXT:    xor t0, t0, a7
+; RV32-NEXT:    xor t1, t1, a6
+; RV32-NEXT:    xor t2, t2, a5
+; RV32-NEXT:    xor t3, t3, a4
+; RV32-NEXT:    and s0, s0, a1
+; RV32-NEXT:    and s1, s1, a1
+; RV32-NEXT:    and s2, s2, a1
+; RV32-NEXT:    and a2, a2, a1
+; RV32-NEXT:    and t0, t0, a1
+; RV32-NEXT:    and t1, t1, a1
+; RV32-NEXT:    and t2, t2, a1
+; RV32-NEXT:    and a1, t3, a1
+; RV32-NEXT:    xor t3, t4, s0
+; RV32-NEXT:    xor t4, t5, s1
+; RV32-NEXT:    xor t5, t6, s2
+; RV32-NEXT:    xor a2, a3, a2
+; RV32-NEXT:    xor a3, a7, t0
+; RV32-NEXT:    xor a6, a6, t1
+; RV32-NEXT:    xor a5, a5, t2
+; RV32-NEXT:    xor a1, a4, a1
+; RV32-NEXT:    sw a3, 16(a0)
+; RV32-NEXT:    sw a6, 20(a0)
+; RV32-NEXT:    sw a5, 24(a0)
+; RV32-NEXT:    sw a1, 28(a0)
+; RV32-NEXT:    sw t3, 0(a0)
+; RV32-NEXT:    sw t4, 4(a0)
+; RV32-NEXT:    sw t5, 8(a0)
+; RV32-NEXT:    sw a2, 12(a0)
+; RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s2, 4(sp) # 4-byte Folded Reload
+; RV32-NEXT:    .cfi_restore s0
+; RV32-NEXT:    .cfi_restore s1
+; RV32-NEXT:    .cfi_restore s2
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    .cfi_def_cfa_offset 0
+; RV32-NEXT:    ret
+  %result = call <8 x i32> @llvm.ct.select.v8i32(i1 %cond, <8 x i32> %a, <8 x 
i32> %b)
+  ret <8 x i32> %result
+}
+
 ; Declare the intrinsics
+declare i1 @llvm.ct.select.i1(i1, i1, i1)
 declare i8 @llvm.ct.select.i8(i1, i8, i8)
 declare i16 @llvm.ct.select.i16(i1, i16, i16)
 declare i32 @llvm.ct.select.i32(i1, i32, i32)
 declare i64 @llvm.ct.select.i64(i1, i64, i64)
 declare ptr @llvm.ct.select.p0(i1, ptr, ptr)
+declare float @llvm.ct.select.f32(i1, float, float)
+declare double @llvm.ct.select.f64(i1, double, double)
+
+; Vector intrinsics
+declare <4 x i32> @llvm.ct.select.v4i32(i1, <4 x i32>, <4 x i32>)
+declare <2 x i64> @llvm.ct.select.v2i64(i1, <2 x i64>, <2 x i64>)
+declare <8 x i16> @llvm.ct.select.v8i16(i1, <8 x i16>, <8 x i16>)
+declare <16 x i8> @llvm.ct.select.v16i8(i1, <16 x i8>, <16 x i8>)
+declare <4 x float> @llvm.ct.select.v4f32(i1, <4 x float>, <4 x float>)
+declare <2 x double> @llvm.ct.select.v2f64(i1, <2 x double>, <2 x double>)
+declare <8 x i32> @llvm.ct.select.v8i32(i1, <8 x i32>, <8 x i32>)
diff --git a/llvm/test/CodeGen/X86/ctselect.ll 
b/llvm/test/CodeGen/X86/ctselect.ll
index 095787a5e2a4b..2b6091c880637 100644
--- a/llvm/test/CodeGen/X86/ctselect.ll
+++ b/llvm/test/CodeGen/X86/ctselect.ll
@@ -8,120 +8,75 @@
 define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) {
 ; X64-LABEL: test_ctselect_i8:
 ; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    andb $1, %dil
-; X64-NEXT:    leal -1(%rdi), %eax
-; X64-NEXT:    movl %edi, %ecx
-; X64-NEXT:    negb %cl
-; X64-NEXT:    andb %sil, %cl
-; X64-NEXT:    andb %dl, %al
-; X64-NEXT:    orb %cl, %al
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    xorl %edx, %esi
+; X64-NEXT:    andb $1, %al
+; X64-NEXT:    negb %al
+; X64-NEXT:    andb %sil, %al
+; X64-NEXT:    xorb %dl, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: test_ctselect_i8:
 ; X32:       # %bb.0:
 ; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    xorb %cl, %dl
 ; X32-NEXT:    andb $1, %al
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    negb %cl
-; X32-NEXT:    andb {{[0-9]+}}(%esp), %cl
-; X32-NEXT:    decb %al
-; X32-NEXT:    andb {{[0-9]+}}(%esp), %al
-; X32-NEXT:    orb %cl, %al
+; X32-NEXT:    negb %al
+; X32-NEXT:    andb %dl, %al
+; X32-NEXT:    xorb %cl, %al
 ; X32-NEXT:    retl
 ;
 ; X32-NOCMOV-LABEL: test_ctselect_i8:
 ; X32-NOCMOV:       # %bb.0:
 ; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    xorb %cl, %dl
 ; X32-NOCMOV-NEXT:    andb $1, %al
-; X32-NOCMOV-NEXT:    movl %eax, %ecx
-; X32-NOCMOV-NEXT:    negb %cl
-; X32-NOCMOV-NEXT:    andb {{[0-9]+}}(%esp), %cl
-; X32-NOCMOV-NEXT:    decb %al
-; X32-NOCMOV-NEXT:    andb {{[0-9]+}}(%esp), %al
-; X32-NOCMOV-NEXT:    orb %cl, %al
+; X32-NOCMOV-NEXT:    negb %al
+; X32-NOCMOV-NEXT:    andb %dl, %al
+; X32-NOCMOV-NEXT:    xorb %cl, %al
 ; X32-NOCMOV-NEXT:    retl
   %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b)
   ret i8 %result
 }
 
-define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) {
-; X64-LABEL: test_ctselect_i16:
+define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) {
+; X64-LABEL: test_ctselect_i32:
 ; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    andl $1, %edi
-; X64-NEXT:    leal -1(%rdi), %ecx
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    xorl %edx, %esi
+; X64-NEXT:    andl $1, %eax
 ; X64-NEXT:    negl %eax
 ; X64-NEXT:    andl %esi, %eax
-; X64-NEXT:    andl %edx, %ecx
-; X64-NEXT:    orl %ecx, %eax
-; X64-NEXT:    # kill: def $ax killed $ax killed $eax
-; X64-NEXT:    retq
-;
-; X32-LABEL: test_ctselect_i16:
-; X32:       # %bb.0:
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    andl $1, %eax
-; X32-NEXT:    leal -1(%eax), %ecx
-; X32-NEXT:    andw {{[0-9]+}}(%esp), %cx
-; X32-NEXT:    negl %eax
-; X32-NEXT:    andw {{[0-9]+}}(%esp), %ax
-; X32-NEXT:    orl %ecx, %eax
-; X32-NEXT:    # kill: def $ax killed $ax killed $eax
-; X32-NEXT:    retl
-;
-; X32-NOCMOV-LABEL: test_ctselect_i16:
-; X32-NOCMOV:       # %bb.0:
-; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    andl $1, %eax
-; X32-NOCMOV-NEXT:    leal -1(%eax), %ecx
-; X32-NOCMOV-NEXT:    andw {{[0-9]+}}(%esp), %cx
-; X32-NOCMOV-NEXT:    negl %eax
-; X32-NOCMOV-NEXT:    andw {{[0-9]+}}(%esp), %ax
-; X32-NOCMOV-NEXT:    orl %ecx, %eax
-; X32-NOCMOV-NEXT:    # kill: def $ax killed $ax killed $eax
-; X32-NOCMOV-NEXT:    retl
-  %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b)
-  ret i16 %result
-}
-
-define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) {
-; X64-LABEL: test_ctselect_i32:
-; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    andl $1, %edi
-; X64-NEXT:    leal -1(%rdi), %eax
-; X64-NEXT:    movl %edi, %ecx
-; X64-NEXT:    negl %ecx
-; X64-NEXT:    andl %esi, %ecx
-; X64-NEXT:    andl %edx, %eax
-; X64-NEXT:    orl %ecx, %eax
+; X64-NEXT:    xorl %edx, %eax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: test_ctselect_i32:
 ; X32:       # %bb.0:
 ; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    xorl %ecx, %edx
 ; X32-NEXT:    andl $1, %eax
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    negl %ecx
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    decl %eax
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    orl %ecx, %eax
+; X32-NEXT:    negl %eax
+; X32-NEXT:    andl %edx, %eax
+; X32-NEXT:    xorl %ecx, %eax
 ; X32-NEXT:    retl
 ;
 ; X32-NOCMOV-LABEL: test_ctselect_i32:
 ; X32-NOCMOV:       # %bb.0:
 ; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    xorl %ecx, %edx
 ; X32-NOCMOV-NEXT:    andl $1, %eax
-; X32-NOCMOV-NEXT:    movl %eax, %ecx
-; X32-NOCMOV-NEXT:    negl %ecx
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    decl %eax
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    orl %ecx, %eax
+; X32-NOCMOV-NEXT:    negl %eax
+; X32-NOCMOV-NEXT:    andl %edx, %eax
+; X32-NOCMOV-NEXT:    xorl %ecx, %eax
 ; X32-NOCMOV-NEXT:    retl
   %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
   ret i32 %result
@@ -130,13 +85,12 @@ define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) {
 define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) {
 ; X64-LABEL: test_ctselect_i64:
 ; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    andl $1, %edi
-; X64-NEXT:    leaq -1(%rdi), %rax
-; X64-NEXT:    negq %rdi
-; X64-NEXT:    andq %rsi, %rdi
-; X64-NEXT:    andq %rdx, %rax
-; X64-NEXT:    orq %rdi, %rax
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    xorq %rdx, %rsi
+; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    negq %rax
+; X64-NEXT:    andq %rsi, %rax
+; X64-NEXT:    xorq %rdx, %rax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: test_ctselect_i64:
@@ -190,14 +144,12 @@ define float @test_ctselect_f32(i1 %cond, float %a, float 
%b) {
 ; X64-LABEL: test_ctselect_f32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movd %xmm1, %eax
+; X64-NEXT:    pxor %xmm1, %xmm0
 ; X64-NEXT:    movd %xmm0, %ecx
 ; X64-NEXT:    andl $1, %edi
-; X64-NEXT:    movl %edi, %edx
-; X64-NEXT:    negl %edx
-; X64-NEXT:    andl %ecx, %edx
-; X64-NEXT:    decl %edi
-; X64-NEXT:    andl %eax, %edi
-; X64-NEXT:    orl %edx, %edi
+; X64-NEXT:    negl %edi
+; X64-NEXT:    andl %ecx, %edi
+; X64-NEXT:    xorl %eax, %edi
 ; X64-NEXT:    movd %edi, %xmm0
 ; X64-NEXT:    retq
 ;
@@ -206,13 +158,13 @@ define float @test_ctselect_f32(i1 %cond, float %a, float 
%b) {
 ; X32-NEXT:    pushl %eax
 ; X32-NEXT:    .cfi_def_cfa_offset 8
 ; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    xorl %ecx, %edx
 ; X32-NEXT:    andl $1, %eax
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    negl %ecx
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    decl %eax
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    orl %ecx, %eax
+; X32-NEXT:    negl %eax
+; X32-NEXT:    andl %edx, %eax
+; X32-NEXT:    xorl %ecx, %eax
 ; X32-NEXT:    movl %eax, (%esp)
 ; X32-NEXT:    flds (%esp)
 ; X32-NEXT:    popl %eax
@@ -224,13 +176,13 @@ define float @test_ctselect_f32(i1 %cond, float %a, float 
%b) {
 ; X32-NOCMOV-NEXT:    pushl %eax
 ; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
 ; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    xorl %ecx, %edx
 ; X32-NOCMOV-NEXT:    andl $1, %eax
-; X32-NOCMOV-NEXT:    movl %eax, %ecx
-; X32-NOCMOV-NEXT:    negl %ecx
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    decl %eax
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    orl %ecx, %eax
+; X32-NOCMOV-NEXT:    negl %eax
+; X32-NOCMOV-NEXT:    andl %edx, %eax
+; X32-NOCMOV-NEXT:    xorl %ecx, %eax
 ; X32-NOCMOV-NEXT:    movl %eax, (%esp)
 ; X32-NOCMOV-NEXT:    flds (%esp)
 ; X32-NOCMOV-NEXT:    popl %eax
@@ -245,14 +197,12 @@ define double @test_ctselect_f64(i1 %cond, double %a, 
double %b) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    movq %xmm1, %rax
+; X64-NEXT:    pxor %xmm1, %xmm0
 ; X64-NEXT:    movq %xmm0, %rcx
 ; X64-NEXT:    andl $1, %edi
-; X64-NEXT:    movq %rdi, %rdx
-; X64-NEXT:    negq %rdx
-; X64-NEXT:    andq %rcx, %rdx
-; X64-NEXT:    decq %rdi
-; X64-NEXT:    andq %rax, %rdi
-; X64-NEXT:    orq %rdx, %rdi
+; X64-NEXT:    negq %rdi
+; X64-NEXT:    andq %rcx, %rdi
+; X64-NEXT:    xorq %rax, %rdi
 ; X64-NEXT:    movq %rdi, %xmm0
 ; X64-NEXT:    retq
 ;
@@ -320,37 +270,36 @@ define double @test_ctselect_f64(i1 %cond, double %a, 
double %b) {
 define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) {
 ; X64-LABEL: test_ctselect_ptr:
 ; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    andl $1, %edi
-; X64-NEXT:    leaq -1(%rdi), %rax
-; X64-NEXT:    negq %rdi
-; X64-NEXT:    andq %rsi, %rdi
-; X64-NEXT:    andq %rdx, %rax
-; X64-NEXT:    orq %rdi, %rax
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    xorq %rdx, %rsi
+; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    negq %rax
+; X64-NEXT:    andq %rsi, %rax
+; X64-NEXT:    xorq %rdx, %rax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: test_ctselect_ptr:
 ; X32:       # %bb.0:
 ; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    xorl %ecx, %edx
 ; X32-NEXT:    andl $1, %eax
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    negl %ecx
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    decl %eax
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    orl %ecx, %eax
+; X32-NEXT:    negl %eax
+; X32-NEXT:    andl %edx, %eax
+; X32-NEXT:    xorl %ecx, %eax
 ; X32-NEXT:    retl
 ;
 ; X32-NOCMOV-LABEL: test_ctselect_ptr:
 ; X32-NOCMOV:       # %bb.0:
 ; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    xorl %ecx, %edx
 ; X32-NOCMOV-NEXT:    andl $1, %eax
-; X32-NOCMOV-NEXT:    movl %eax, %ecx
-; X32-NOCMOV-NEXT:    negl %ecx
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    decl %eax
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    orl %ecx, %eax
+; X32-NOCMOV-NEXT:    negl %eax
+; X32-NOCMOV-NEXT:    andl %edx, %eax
+; X32-NOCMOV-NEXT:    xorl %ecx, %eax
 ; X32-NOCMOV-NEXT:    retl
   %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b)
   ret ptr %result
@@ -361,16 +310,24 @@ define i32 @test_ctselect_const_true(i32 %a, i32 %b) {
 ; X64-LABEL: test_ctselect_const_true:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    xorl %esi, %eax
+; X64-NEXT:    xorl %esi, %eax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: test_ctselect_const_true:
 ; X32:       # %bb.0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    xorl %ecx, %eax
+; X32-NEXT:    xorl %ecx, %eax
 ; X32-NEXT:    retl
 ;
 ; X32-NOCMOV-LABEL: test_ctselect_const_true:
 ; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    xorl %ecx, %eax
+; X32-NOCMOV-NEXT:    xorl %ecx, %eax
 ; X32-NOCMOV-NEXT:    retl
   %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b)
   ret i32 %result
@@ -385,13 +342,13 @@ define i32 @test_ctselect_const_false(i32 %a, i32 %b) {
 ; X32-LABEL: test_ctselect_const_false:
 ; X32:       # %bb.0:
 ; X32-NEXT:    xorl %eax, %eax
-; X32-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    retl
 ;
 ; X32-NOCMOV-LABEL: test_ctselect_const_false:
 ; X32-NOCMOV:       # %bb.0:
 ; X32-NOCMOV-NEXT:    xorl %eax, %eax
-; X32-NOCMOV-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    xorl {{[0-9]+}}(%esp), %eax
 ; X32-NOCMOV-NEXT:    retl
   %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b)
   ret i32 %result
@@ -404,174 +361,79 @@ define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 
%a, i32 %b) {
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpl %esi, %edi
 ; X64-NEXT:    sete %al
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    negl %esi
-; X64-NEXT:    andl %edx, %esi
-; X64-NEXT:    decl %eax
-; X64-NEXT:    andl %ecx, %eax
-; X64-NEXT:    orl %esi, %eax
+; X64-NEXT:    xorl %ecx, %edx
+; X64-NEXT:    negl %eax
+; X64-NEXT:    andl %edx, %eax
+; X64-NEXT:    xorl %ecx, %eax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: test_ctselect_icmp_eq:
 ; X32:       # %bb.0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-NEXT:    xorl %eax, %eax
-; X32-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    cmpl {{[0-9]+}}(%esp), %edx
 ; X32-NEXT:    sete %al
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    negl %ecx
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    decl %eax
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    orl %ecx, %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    xorl %ecx, %edx
+; X32-NEXT:    negl %eax
+; X32-NEXT:    andl %edx, %eax
+; X32-NEXT:    xorl %ecx, %eax
 ; X32-NEXT:    retl
 ;
 ; X32-NOCMOV-LABEL: test_ctselect_icmp_eq:
 ; X32-NOCMOV:       # %bb.0:
 ; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-NOCMOV-NEXT:    xorl %eax, %eax
-; X32-NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %edx
 ; X32-NOCMOV-NEXT:    sete %al
-; X32-NOCMOV-NEXT:    movl %eax, %ecx
-; X32-NOCMOV-NEXT:    negl %ecx
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    decl %eax
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    orl %ecx, %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    xorl %ecx, %edx
+; X32-NOCMOV-NEXT:    negl %eax
+; X32-NOCMOV-NEXT:    andl %edx, %eax
+; X32-NOCMOV-NEXT:    xorl %ecx, %eax
 ; X32-NOCMOV-NEXT:    retl
   %cond = icmp eq i32 %x, %y
   %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
   ret i32 %result
 }
 
-define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) {
-; X64-LABEL: test_ctselect_icmp_ne:
-; X64:       # %bb.0:
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %esi, %edi
-; X64-NEXT:    setne %al
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    negl %esi
-; X64-NEXT:    andl %edx, %esi
-; X64-NEXT:    decl %eax
-; X64-NEXT:    andl %ecx, %eax
-; X64-NEXT:    orl %esi, %eax
-; X64-NEXT:    retq
-;
-; X32-LABEL: test_ctselect_icmp_ne:
-; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    xorl %eax, %eax
-; X32-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    setne %al
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    negl %ecx
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    decl %eax
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    orl %ecx, %eax
-; X32-NEXT:    retl
-;
-; X32-NOCMOV-LABEL: test_ctselect_icmp_ne:
-; X32-NOCMOV:       # %bb.0:
-; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    xorl %eax, %eax
-; X32-NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    setne %al
-; X32-NOCMOV-NEXT:    movl %eax, %ecx
-; X32-NOCMOV-NEXT:    negl %ecx
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    decl %eax
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    orl %ecx, %eax
-; X32-NOCMOV-NEXT:    retl
-  %cond = icmp ne i32 %x, %y
-  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
-  ret i32 %result
-}
-
-define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) {
-; X64-LABEL: test_ctselect_icmp_slt:
-; X64:       # %bb.0:
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %esi, %edi
-; X64-NEXT:    setl %al
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    negl %esi
-; X64-NEXT:    andl %edx, %esi
-; X64-NEXT:    decl %eax
-; X64-NEXT:    andl %ecx, %eax
-; X64-NEXT:    orl %esi, %eax
-; X64-NEXT:    retq
-;
-; X32-LABEL: test_ctselect_icmp_slt:
-; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    xorl %eax, %eax
-; X32-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    setl %al
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    negl %ecx
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    decl %eax
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    orl %ecx, %eax
-; X32-NEXT:    retl
-;
-; X32-NOCMOV-LABEL: test_ctselect_icmp_slt:
-; X32-NOCMOV:       # %bb.0:
-; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    xorl %eax, %eax
-; X32-NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    setl %al
-; X32-NOCMOV-NEXT:    movl %eax, %ecx
-; X32-NOCMOV-NEXT:    negl %ecx
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    decl %eax
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    orl %ecx, %eax
-; X32-NOCMOV-NEXT:    retl
-  %cond = icmp slt i32 %x, %y
-  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
-  ret i32 %result
-}
-
 define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) {
 ; X64-LABEL: test_ctselect_icmp_ult:
 ; X64:       # %bb.0:
+; X64-NEXT:    xorl %ecx, %edx
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpl %esi, %edi
 ; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    andl %eax, %edx
-; X64-NEXT:    notl %eax
-; X64-NEXT:    andl %ecx, %eax
-; X64-NEXT:    orl %edx, %eax
+; X64-NEXT:    andl %edx, %eax
+; X64-NEXT:    xorl %ecx, %eax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: test_ctselect_icmp_ult:
 ; X32:       # %bb.0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    xorl %eax, %eax
-; X32-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    sbbl %eax, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    andl %eax, %ecx
-; X32-NEXT:    notl %eax
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    orl %ecx, %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    xorl %edx, %edx
+; X32-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    sbbl %edx, %edx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    xorl %ecx, %eax
+; X32-NEXT:    andl %edx, %eax
+; X32-NEXT:    xorl %ecx, %eax
 ; X32-NEXT:    retl
 ;
 ; X32-NOCMOV-LABEL: test_ctselect_icmp_ult:
 ; X32-NOCMOV:       # %bb.0:
 ; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    xorl %eax, %eax
-; X32-NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    sbbl %eax, %eax
-; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    andl %eax, %ecx
-; X32-NOCMOV-NEXT:    notl %eax
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    orl %ecx, %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    xorl %edx, %edx
+; X32-NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    sbbl %edx, %edx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    xorl %ecx, %eax
+; X32-NOCMOV-NEXT:    andl %edx, %eax
+; X32-NOCMOV-NEXT:    xorl %ecx, %eax
 ; X32-NOCMOV-NEXT:    retl
   %cond = icmp ult i32 %x, %y
   %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
@@ -583,12 +445,10 @@ define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movd %xmm3, %eax
 ; X64-NEXT:    cmpeqss %xmm1, %xmm0
-; X64-NEXT:    movd %xmm0, %ecx
-; X64-NEXT:    pand %xmm2, %xmm0
-; X64-NEXT:    movd %xmm0, %edx
-; X64-NEXT:    notl %ecx
-; X64-NEXT:    andl %eax, %ecx
-; X64-NEXT:    orl %edx, %ecx
+; X64-NEXT:    pxor %xmm3, %xmm2
+; X64-NEXT:    pand %xmm0, %xmm2
+; X64-NEXT:    movd %xmm2, %ecx
+; X64-NEXT:    xorl %eax, %ecx
 ; X64-NEXT:    movd %ecx, %xmm0
 ; X64-NEXT:    retq
 ;
@@ -596,21 +456,21 @@ define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) {
 ; X32:       # %bb.0:
 ; X32-NEXT:    pushl %eax
 ; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    flds {{[0-9]+}}(%esp)
 ; X32-NEXT:    flds {{[0-9]+}}(%esp)
 ; X32-NEXT:    fucompi %st(1), %st
 ; X32-NEXT:    fstp %st(0)
-; X32-NEXT:    setnp %al
-; X32-NEXT:    sete %cl
-; X32-NEXT:    andb %al, %cl
-; X32-NEXT:    movzbl %cl, %eax
-; X32-NEXT:    movl %eax, %ecx
+; X32-NEXT:    setnp %cl
+; X32-NEXT:    sete %dl
+; X32-NEXT:    andb %cl, %dl
+; X32-NEXT:    movzbl %dl, %ecx
 ; X32-NEXT:    negl %ecx
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    decl %eax
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    orl %ecx, %eax
-; X32-NEXT:    movl %eax, (%esp)
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    xorl %eax, %edx
+; X32-NEXT:    andl %ecx, %edx
+; X32-NEXT:    xorl %eax, %edx
+; X32-NEXT:    movl %edx, (%esp)
 ; X32-NEXT:    flds (%esp)
 ; X32-NEXT:    popl %eax
 ; X32-NEXT:    .cfi_def_cfa_offset 4
@@ -620,6 +480,7 @@ define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) {
 ; X32-NOCMOV:       # %bb.0:
 ; X32-NOCMOV-NEXT:    pushl %eax
 ; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NOCMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X32-NOCMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X32-NOCMOV-NEXT:    fucompp
@@ -627,16 +488,15 @@ define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) {
 ; X32-NOCMOV-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X32-NOCMOV-NEXT:    sahf
 ; X32-NOCMOV-NEXT:    setnp %al
-; X32-NOCMOV-NEXT:    sete %cl
-; X32-NOCMOV-NEXT:    andb %al, %cl
-; X32-NOCMOV-NEXT:    movzbl %cl, %eax
-; X32-NOCMOV-NEXT:    movl %eax, %ecx
-; X32-NOCMOV-NEXT:    negl %ecx
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    decl %eax
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    orl %ecx, %eax
-; X32-NOCMOV-NEXT:    movl %eax, (%esp)
+; X32-NOCMOV-NEXT:    sete %dl
+; X32-NOCMOV-NEXT:    andb %al, %dl
+; X32-NOCMOV-NEXT:    movzbl %dl, %eax
+; X32-NOCMOV-NEXT:    negl %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    xorl %ecx, %edx
+; X32-NOCMOV-NEXT:    andl %eax, %edx
+; X32-NOCMOV-NEXT:    xorl %ecx, %edx
+; X32-NOCMOV-NEXT:    movl %edx, (%esp)
 ; X32-NOCMOV-NEXT:    flds (%esp)
 ; X32-NOCMOV-NEXT:    popl %eax
 ; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
@@ -650,52 +510,41 @@ define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) {
 define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) {
 ; X64-LABEL: test_ctselect_load:
 ; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    movl (%rdx), %ecx
+; X64-NEXT:    movl (%rsi), %eax
+; X64-NEXT:    xorl %ecx, %eax
 ; X64-NEXT:    andl $1, %edi
-; X64-NEXT:    leal -1(%rdi), %eax
-; X64-NEXT:    movl %edi, %ecx
-; X64-NEXT:    negl %ecx
-; X64-NEXT:    andl (%rsi), %ecx
-; X64-NEXT:    andl (%rdx), %eax
-; X64-NEXT:    orl %ecx, %eax
+; X64-NEXT:    negl %edi
+; X64-NEXT:    andl %edi, %eax
+; X64-NEXT:    xorl %ecx, %eax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: test_ctselect_load:
 ; X32:       # %bb.0:
-; X32-NEXT:    pushl %esi
-; X32-NEXT:    .cfi_def_cfa_offset 8
-; X32-NEXT:    .cfi_offset %esi, -8
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl (%edx), %edx
+; X32-NEXT:    movl (%ecx), %ecx
+; X32-NEXT:    xorl %edx, %ecx
 ; X32-NEXT:    andl $1, %eax
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    negl %esi
-; X32-NEXT:    andl (%edx), %esi
-; X32-NEXT:    decl %eax
-; X32-NEXT:    andl (%ecx), %eax
-; X32-NEXT:    orl %esi, %eax
-; X32-NEXT:    popl %esi
-; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    negl %eax
+; X32-NEXT:    andl %ecx, %eax
+; X32-NEXT:    xorl %edx, %eax
 ; X32-NEXT:    retl
 ;
 ; X32-NOCMOV-LABEL: test_ctselect_load:
 ; X32-NOCMOV:       # %bb.0:
-; X32-NOCMOV-NEXT:    pushl %esi
-; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
-; X32-NOCMOV-NEXT:    .cfi_offset %esi, -8
+; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    movl (%edx), %edx
+; X32-NOCMOV-NEXT:    movl (%ecx), %ecx
+; X32-NOCMOV-NEXT:    xorl %edx, %ecx
 ; X32-NOCMOV-NEXT:    andl $1, %eax
-; X32-NOCMOV-NEXT:    movl %eax, %esi
-; X32-NOCMOV-NEXT:    negl %esi
-; X32-NOCMOV-NEXT:    andl (%edx), %esi
-; X32-NOCMOV-NEXT:    decl %eax
-; X32-NOCMOV-NEXT:    andl (%ecx), %eax
-; X32-NOCMOV-NEXT:    orl %esi, %eax
-; X32-NOCMOV-NEXT:    popl %esi
-; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
+; X32-NOCMOV-NEXT:    negl %eax
+; X32-NOCMOV-NEXT:    andl %ecx, %eax
+; X32-NOCMOV-NEXT:    xorl %edx, %eax
 ; X32-NOCMOV-NEXT:    retl
   %a = load i32, ptr %p1
   %b = load i32, ptr %p2
@@ -707,69 +556,753 @@ define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) {
 define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) {
 ; X64-LABEL: test_ctselect_nested:
 ; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $esi killed $esi def $rsi
-; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    xorl %ecx, %edx
 ; X64-NEXT:    andl $1, %esi
-; X64-NEXT:    leal -1(%rsi), %r9d
-; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    negl %esi
+; X64-NEXT:    andl %edx, %esi
+; X64-NEXT:    xorl %r8d, %ecx
+; X64-NEXT:    xorl %esi, %ecx
+; X64-NEXT:    andl $1, %eax
 ; X64-NEXT:    negl %eax
-; X64-NEXT:    andl %edx, %eax
-; X64-NEXT:    andl %ecx, %r9d
-; X64-NEXT:    orl %eax, %r9d
-; X64-NEXT:    andl $1, %edi
-; X64-NEXT:    leal -1(%rdi), %eax
-; X64-NEXT:    movl %edi, %ecx
-; X64-NEXT:    negl %ecx
-; X64-NEXT:    andl %r9d, %ecx
-; X64-NEXT:    andl %r8d, %eax
-; X64-NEXT:    orl %ecx, %eax
+; X64-NEXT:    andl %ecx, %eax
+; X64-NEXT:    xorl %r8d, %eax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: test_ctselect_nested:
 ; X32:       # %bb.0:
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 12
+; X32-NEXT:    .cfi_offset %esi, -12
+; X32-NEXT:    .cfi_offset %edi, -8
 ; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    andl $1, %ecx
-; X32-NEXT:    movl %ecx, %edx
-; X32-NEXT:    negl %edx
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    decl %ecx
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    orl %edx, %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    xorl %edx, %edi
+; X32-NEXT:    andl $1, %esi
+; X32-NEXT:    negl %esi
+; X32-NEXT:    andl %edi, %esi
+; X32-NEXT:    xorl %ecx, %edx
+; X32-NEXT:    xorl %esi, %edx
 ; X32-NEXT:    andl $1, %eax
-; X32-NEXT:    movl %eax, %edx
-; X32-NEXT:    negl %edx
-; X32-NEXT:    andl %ecx, %edx
-; X32-NEXT:    decl %eax
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    orl %edx, %eax
+; X32-NEXT:    negl %eax
+; X32-NEXT:    andl %edx, %eax
+; X32-NEXT:    xorl %ecx, %eax
+; X32-NEXT:    popl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    popl %edi
+; X32-NEXT:    .cfi_def_cfa_offset 4
 ; X32-NEXT:    retl
 ;
 ; X32-NOCMOV-LABEL: test_ctselect_nested:
 ; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    pushl %edi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    pushl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT:    .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT:    .cfi_offset %edi, -8
 ; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    andl $1, %ecx
-; X32-NOCMOV-NEXT:    movl %ecx, %edx
-; X32-NOCMOV-NEXT:    negl %edx
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT:    decl %ecx
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    orl %edx, %ecx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %esi
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NOCMOV-NEXT:    xorl %edx, %edi
+; X32-NOCMOV-NEXT:    andl $1, %esi
+; X32-NOCMOV-NEXT:    negl %esi
+; X32-NOCMOV-NEXT:    andl %edi, %esi
+; X32-NOCMOV-NEXT:    xorl %ecx, %edx
+; X32-NOCMOV-NEXT:    xorl %esi, %edx
 ; X32-NOCMOV-NEXT:    andl $1, %eax
-; X32-NOCMOV-NEXT:    movl %eax, %edx
-; X32-NOCMOV-NEXT:    negl %edx
-; X32-NOCMOV-NEXT:    andl %ecx, %edx
-; X32-NOCMOV-NEXT:    decl %eax
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    orl %edx, %eax
+; X32-NOCMOV-NEXT:    negl %eax
+; X32-NOCMOV-NEXT:    andl %edx, %eax
+; X32-NOCMOV-NEXT:    xorl %ecx, %eax
+; X32-NOCMOV-NEXT:    popl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    popl %edi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
 ; X32-NOCMOV-NEXT:    retl
   %inner = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b)
   %result = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %inner, i32 %c)
   ret i32 %result
 }
 
+; Test nested CTSELECT pattern with AND merging on i1 values
+; Pattern: ctselect C0, (ctselect C1, X, Y), Y -> ctselect (C0 & C1), X, Y
+; This optimization only applies when selecting between i1 values (boolean logic)
+define i32 @test_ctselect_nested_and_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) 
{
+; X64-LABEL: test_ctselect_nested_and_i1_to_i32:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andl %esi, %eax
+; X64-NEXT:    xorl %ecx, %edx
+; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    negl %eax
+; X64-NEXT:    andl %edx, %eax
+; X64-NEXT:    xorl %ecx, %eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_nested_and_i1_to_i32:
+; X32:       # %bb.0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    andb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    movzbl %al, %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    xorl %ecx, %edx
+; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    negl %eax
+; X32-NEXT:    andl %edx, %eax
+; X32-NEXT:    xorl %ecx, %eax
+; X32-NEXT:    retl
+;
+; X32-NOCMOV-LABEL: test_ctselect_nested_and_i1_to_i32:
+; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    andb {{[0-9]+}}(%esp), %al
+; X32-NOCMOV-NEXT:    movzbl %al, %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    xorl %ecx, %edx
+; X32-NOCMOV-NEXT:    andl $1, %eax
+; X32-NOCMOV-NEXT:    negl %eax
+; X32-NOCMOV-NEXT:    andl %edx, %eax
+; X32-NOCMOV-NEXT:    xorl %ecx, %eax
+; X32-NOCMOV-NEXT:    retl
+  %inner = call i1 @llvm.ct.select.i1(i1 %c1, i1 true, i1 false)
+  %cond = call i1 @llvm.ct.select.i1(i1 %c0, i1 %inner, i1 false)
+  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %y)
+  ret i32 %result
+}
+
+; Test nested CTSELECT pattern with OR merging on i1 values
+; Pattern: ctselect C0, X, (ctselect C1, X, Y) -> ctselect (C0 | C1), X, Y
+; This optimization only applies when selecting between i1 values (boolean logic)
+define i32 @test_ctselect_nested_or_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
+; X64-LABEL: test_ctselect_nested_or_i1_to_i32:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    orl %esi, %eax
+; X64-NEXT:    xorl %ecx, %edx
+; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    negl %eax
+; X64-NEXT:    andl %edx, %eax
+; X64-NEXT:    xorl %ecx, %eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_nested_or_i1_to_i32:
+; X32:       # %bb.0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    orb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    movzbl %al, %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    xorl %ecx, %edx
+; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    negl %eax
+; X32-NEXT:    andl %edx, %eax
+; X32-NEXT:    xorl %ecx, %eax
+; X32-NEXT:    retl
+;
+; X32-NOCMOV-LABEL: test_ctselect_nested_or_i1_to_i32:
+; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    orb {{[0-9]+}}(%esp), %al
+; X32-NOCMOV-NEXT:    movzbl %al, %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    xorl %ecx, %edx
+; X32-NOCMOV-NEXT:    andl $1, %eax
+; X32-NOCMOV-NEXT:    negl %eax
+; X32-NOCMOV-NEXT:    andl %edx, %eax
+; X32-NOCMOV-NEXT:    xorl %ecx, %eax
+; X32-NOCMOV-NEXT:    retl
+  %inner = call i1 @llvm.ct.select.i1(i1 %c1, i1 true, i1 false)
+  %cond = call i1 @llvm.ct.select.i1(i1 %c0, i1 true, i1 %inner)
+  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %y)
+  ret i32 %result
+}
+
+; Test double nested CTSELECT with recursive AND merging
+; Pattern: ctselect C0, (ctselect C1, (ctselect C2, X, Y), Y), Y
+;   -> ctselect C0, (ctselect (C1 & C2), X, Y), Y
+;   -> ctselect (C0 & (C1 & C2)), X, Y
+; This tests that the optimization can be applied recursively
+define i32 @test_ctselect_double_nested_and_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x, i32 %y) {
+; X64-LABEL: test_ctselect_double_nested_and_i1:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    andl %edx, %eax
+; X64-NEXT:    andl %edi, %eax
+; X64-NEXT:    xorl %r8d, %ecx
+; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    negl %eax
+; X64-NEXT:    andl %ecx, %eax
+; X64-NEXT:    xorl %r8d, %eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_double_nested_and_i1:
+; X32:       # %bb.0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    andb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    andb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    movzbl %al, %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    xorl %ecx, %edx
+; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    negl %eax
+; X32-NEXT:    andl %edx, %eax
+; X32-NEXT:    xorl %ecx, %eax
+; X32-NEXT:    retl
+;
+; X32-NOCMOV-LABEL: test_ctselect_double_nested_and_i1:
+; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    andb {{[0-9]+}}(%esp), %al
+; X32-NOCMOV-NEXT:    andb {{[0-9]+}}(%esp), %al
+; X32-NOCMOV-NEXT:    movzbl %al, %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    xorl %ecx, %edx
+; X32-NOCMOV-NEXT:    andl $1, %eax
+; X32-NOCMOV-NEXT:    negl %eax
+; X32-NOCMOV-NEXT:    andl %edx, %eax
+; X32-NOCMOV-NEXT:    xorl %ecx, %eax
+; X32-NOCMOV-NEXT:    retl
+  %inner2 = call i1 @llvm.ct.select.i1(i1 %c2, i1 true, i1 false)
+  %inner1 = call i1 @llvm.ct.select.i1(i1 %c1, i1 %inner2, i1 false)
+  %cond = call i1 @llvm.ct.select.i1(i1 %c0, i1 %inner1, i1 false)
+  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %y)
+  ret i32 %result
+}
+
+; Vector CTSELECT Tests
+; ============================================================================
+
+; Test vector CTSELECT with v4i32 (128-bit vector with single i1 mask)
+; NOW CONSTANT-TIME: Uses bitwise XOR/AND operations instead of branches!
+define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) {
+; X64-LABEL: test_ctselect_v4i32:
+; X64:       # %bb.0:
+; X64-NEXT:    pxor %xmm1, %xmm0
+; X64-NEXT:    movd %edi, %xmm2
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; X64-NEXT:    pslld $31, %xmm2
+; X64-NEXT:    psrad $31, %xmm2
+; X64-NEXT:    pand %xmm2, %xmm0
+; X64-NEXT:    pxor %xmm1, %xmm0
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_v4i32:
+; X32:       # %bb.0:
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    .cfi_def_cfa_offset 12
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    .cfi_def_cfa_offset 16
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 20
+; X32-NEXT:    .cfi_offset %esi, -20
+; X32-NEXT:    .cfi_offset %edi, -16
+; X32-NEXT:    .cfi_offset %ebx, -12
+; X32-NEXT:    .cfi_offset %ebp, -8
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    xorl %ebx, %edx
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    andl $1, %edi
+; X32-NEXT:    negl %edi
+; X32-NEXT:    andl %edi, %edx
+; X32-NEXT:    xorl %ebx, %edx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT:    xorl %ebp, %ebx
+; X32-NEXT:    andl %edi, %ebx
+; X32-NEXT:    xorl %ebp, %ebx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT:    xorl %esi, %ebp
+; X32-NEXT:    andl %edi, %ebp
+; X32-NEXT:    xorl %esi, %ebp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    xorl %ecx, %esi
+; X32-NEXT:    andl %edi, %esi
+; X32-NEXT:    xorl %ecx, %esi
+; X32-NEXT:    movl %esi, 12(%eax)
+; X32-NEXT:    movl %ebp, 8(%eax)
+; X32-NEXT:    movl %ebx, 4(%eax)
+; X32-NEXT:    movl %edx, (%eax)
+; X32-NEXT:    popl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 16
+; X32-NEXT:    popl %edi
+; X32-NEXT:    .cfi_def_cfa_offset 12
+; X32-NEXT:    popl %ebx
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    retl $4
+;
+; X32-NOCMOV-LABEL: test_ctselect_v4i32:
+; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    pushl %ebp
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    pushl %ebx
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT:    pushl %edi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT:    pushl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 20
+; X32-NOCMOV-NEXT:    .cfi_offset %esi, -20
+; X32-NOCMOV-NEXT:    .cfi_offset %edi, -16
+; X32-NOCMOV-NEXT:    .cfi_offset %ebx, -12
+; X32-NOCMOV-NEXT:    .cfi_offset %ebp, -8
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    xorl %ebx, %edx
+; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %edi
+; X32-NOCMOV-NEXT:    andl $1, %edi
+; X32-NOCMOV-NEXT:    negl %edi
+; X32-NOCMOV-NEXT:    andl %edi, %edx
+; X32-NOCMOV-NEXT:    xorl %ebx, %edx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NOCMOV-NEXT:    xorl %ebp, %ebx
+; X32-NOCMOV-NEXT:    andl %edi, %ebx
+; X32-NOCMOV-NEXT:    xorl %ebp, %ebx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NOCMOV-NEXT:    xorl %esi, %ebp
+; X32-NOCMOV-NEXT:    andl %edi, %ebp
+; X32-NOCMOV-NEXT:    xorl %esi, %ebp
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NOCMOV-NEXT:    xorl %ecx, %esi
+; X32-NOCMOV-NEXT:    andl %edi, %esi
+; X32-NOCMOV-NEXT:    xorl %ecx, %esi
+; X32-NOCMOV-NEXT:    movl %esi, 12(%eax)
+; X32-NOCMOV-NEXT:    movl %ebp, 8(%eax)
+; X32-NOCMOV-NEXT:    movl %ebx, 4(%eax)
+; X32-NOCMOV-NEXT:    movl %edx, (%eax)
+; X32-NOCMOV-NEXT:    popl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT:    popl %edi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT:    popl %ebx
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    popl %ebp
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
+; X32-NOCMOV-NEXT:    retl $4
+  %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b)
+  ret <4 x i32> %result
+}
+define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) {
+; X64-LABEL: test_ctselect_v4f32:
+; X64:       # %bb.0:
+; X64-NEXT:    pxor %xmm1, %xmm0
+; X64-NEXT:    movd %edi, %xmm2
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; X64-NEXT:    pslld $31, %xmm2
+; X64-NEXT:    psrad $31, %xmm2
+; X64-NEXT:    pand %xmm2, %xmm0
+; X64-NEXT:    pxor %xmm1, %xmm0
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_v4f32:
+; X32:       # %bb.0:
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    .cfi_def_cfa_offset 12
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    .cfi_def_cfa_offset 16
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 20
+; X32-NEXT:    .cfi_offset %esi, -20
+; X32-NEXT:    .cfi_offset %edi, -16
+; X32-NEXT:    .cfi_offset %ebx, -12
+; X32-NEXT:    .cfi_offset %ebp, -8
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    xorl %ebx, %edx
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    andl $1, %edi
+; X32-NEXT:    negl %edi
+; X32-NEXT:    andl %edi, %edx
+; X32-NEXT:    xorl %ebx, %edx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT:    xorl %ebp, %ebx
+; X32-NEXT:    andl %edi, %ebx
+; X32-NEXT:    xorl %ebp, %ebx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT:    xorl %esi, %ebp
+; X32-NEXT:    andl %edi, %ebp
+; X32-NEXT:    xorl %esi, %ebp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    xorl %ecx, %esi
+; X32-NEXT:    andl %edi, %esi
+; X32-NEXT:    xorl %ecx, %esi
+; X32-NEXT:    movl %esi, 12(%eax)
+; X32-NEXT:    movl %ebp, 8(%eax)
+; X32-NEXT:    movl %ebx, 4(%eax)
+; X32-NEXT:    movl %edx, (%eax)
+; X32-NEXT:    popl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 16
+; X32-NEXT:    popl %edi
+; X32-NEXT:    .cfi_def_cfa_offset 12
+; X32-NEXT:    popl %ebx
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    retl $4
+;
+; X32-NOCMOV-LABEL: test_ctselect_v4f32:
+; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    pushl %ebp
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    pushl %ebx
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT:    pushl %edi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT:    pushl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 20
+; X32-NOCMOV-NEXT:    .cfi_offset %esi, -20
+; X32-NOCMOV-NEXT:    .cfi_offset %edi, -16
+; X32-NOCMOV-NEXT:    .cfi_offset %ebx, -12
+; X32-NOCMOV-NEXT:    .cfi_offset %ebp, -8
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    xorl %ebx, %edx
+; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %edi
+; X32-NOCMOV-NEXT:    andl $1, %edi
+; X32-NOCMOV-NEXT:    negl %edi
+; X32-NOCMOV-NEXT:    andl %edi, %edx
+; X32-NOCMOV-NEXT:    xorl %ebx, %edx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NOCMOV-NEXT:    xorl %ebp, %ebx
+; X32-NOCMOV-NEXT:    andl %edi, %ebx
+; X32-NOCMOV-NEXT:    xorl %ebp, %ebx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NOCMOV-NEXT:    xorl %esi, %ebp
+; X32-NOCMOV-NEXT:    andl %edi, %ebp
+; X32-NOCMOV-NEXT:    xorl %esi, %ebp
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NOCMOV-NEXT:    xorl %ecx, %esi
+; X32-NOCMOV-NEXT:    andl %edi, %esi
+; X32-NOCMOV-NEXT:    xorl %ecx, %esi
+; X32-NOCMOV-NEXT:    movl %esi, 12(%eax)
+; X32-NOCMOV-NEXT:    movl %ebp, 8(%eax)
+; X32-NOCMOV-NEXT:    movl %ebx, 4(%eax)
+; X32-NOCMOV-NEXT:    movl %edx, (%eax)
+; X32-NOCMOV-NEXT:    popl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT:    popl %edi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT:    popl %ebx
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    popl %ebp
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
+; X32-NOCMOV-NEXT:    retl $4
+  %result = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %a, <4 x float> %b)
+  ret <4 x float> %result
+}
+
+define <8 x i32> @test_ctselect_v8i32_avx(i1 %cond, <8 x i32> %a, <8 x i32> %b) {
+; X64-LABEL: test_ctselect_v8i32_avx:
+; X64:       # %bb.0:
+; X64-NEXT:    movd %edi, %xmm4
+; X64-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; X64-NEXT:    pslld $31, %xmm4
+; X64-NEXT:    psrad $31, %xmm4
+; X64-NEXT:    movdqa %xmm4, %xmm5
+; X64-NEXT:    pandn %xmm2, %xmm5
+; X64-NEXT:    pand %xmm4, %xmm0
+; X64-NEXT:    por %xmm5, %xmm0
+; X64-NEXT:    pand %xmm4, %xmm1
+; X64-NEXT:    pandn %xmm3, %xmm4
+; X64-NEXT:    por %xmm4, %xmm1
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_v8i32_avx:
+; X32:       # %bb.0:
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    .cfi_def_cfa_offset 12
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    .cfi_def_cfa_offset 16
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 20
+; X32-NEXT:    subl $8, %esp
+; X32-NEXT:    .cfi_def_cfa_offset 28
+; X32-NEXT:    .cfi_offset %esi, -20
+; X32-NEXT:    .cfi_offset %edi, -16
+; X32-NEXT:    .cfi_offset %ebx, -12
+; X32-NEXT:    .cfi_offset %ebp, -8
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    xorl %eax, %ecx
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    andl $1, %edx
+; X32-NEXT:    negl %edx
+; X32-NEXT:    andl %edx, %ecx
+; X32-NEXT:    xorl %eax, %ecx
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    xorl %esi, %eax
+; X32-NEXT:    andl %edx, %eax
+; X32-NEXT:    xorl %esi, %eax
+; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    xorl %ebx, %esi
+; X32-NEXT:    andl %edx, %esi
+; X32-NEXT:    xorl %ebx, %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT:    xorl %ebp, %ebx
+; X32-NEXT:    andl %edx, %ebx
+; X32-NEXT:    xorl %ebp, %ebx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT:    xorl %edi, %ebp
+; X32-NEXT:    andl %edx, %ebp
+; X32-NEXT:    xorl %edi, %ebp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    xorl %eax, %edi
+; X32-NEXT:    andl %edx, %edi
+; X32-NEXT:    xorl %eax, %edi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    xorl %eax, %ecx
+; X32-NEXT:    andl %edx, %ecx
+; X32-NEXT:    xorl %eax, %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    andl %edx, %eax
+; X32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl %eax, 28(%edx)
+; X32-NEXT:    movl %ecx, 24(%edx)
+; X32-NEXT:    movl %edi, 20(%edx)
+; X32-NEXT:    movl %ebp, 16(%edx)
+; X32-NEXT:    movl %ebx, 12(%edx)
+; X32-NEXT:    movl %esi, 8(%edx)
+; X32-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl %eax, 4(%edx)
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    movl %eax, (%edx)
+; X32-NEXT:    movl %edx, %eax
+; X32-NEXT:    addl $8, %esp
+; X32-NEXT:    .cfi_def_cfa_offset 20
+; X32-NEXT:    popl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 16
+; X32-NEXT:    popl %edi
+; X32-NEXT:    .cfi_def_cfa_offset 12
+; X32-NEXT:    popl %ebx
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    retl $4
+;
+; X32-NOCMOV-LABEL: test_ctselect_v8i32_avx:
+; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    pushl %ebp
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    pushl %ebx
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT:    pushl %edi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT:    pushl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 20
+; X32-NOCMOV-NEXT:    subl $8, %esp
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 28
+; X32-NOCMOV-NEXT:    .cfi_offset %esi, -20
+; X32-NOCMOV-NEXT:    .cfi_offset %edi, -16
+; X32-NOCMOV-NEXT:    .cfi_offset %ebx, -12
+; X32-NOCMOV-NEXT:    .cfi_offset %ebp, -8
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    xorl %eax, %ecx
+; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    andl $1, %edx
+; X32-NOCMOV-NEXT:    negl %edx
+; X32-NOCMOV-NEXT:    andl %edx, %ecx
+; X32-NOCMOV-NEXT:    xorl %eax, %ecx
+; X32-NOCMOV-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    xorl %esi, %eax
+; X32-NOCMOV-NEXT:    andl %edx, %eax
+; X32-NOCMOV-NEXT:    xorl %esi, %eax
+; X32-NOCMOV-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NOCMOV-NEXT:    xorl %ebx, %esi
+; X32-NOCMOV-NEXT:    andl %edx, %esi
+; X32-NOCMOV-NEXT:    xorl %ebx, %esi
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NOCMOV-NEXT:    xorl %ebp, %ebx
+; X32-NOCMOV-NEXT:    andl %edx, %ebx
+; X32-NOCMOV-NEXT:    xorl %ebp, %ebx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NOCMOV-NEXT:    xorl %edi, %ebp
+; X32-NOCMOV-NEXT:    andl %edx, %ebp
+; X32-NOCMOV-NEXT:    xorl %edi, %ebp
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NOCMOV-NEXT:    xorl %eax, %edi
+; X32-NOCMOV-NEXT:    andl %edx, %edi
+; X32-NOCMOV-NEXT:    xorl %eax, %edi
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    xorl %eax, %ecx
+; X32-NOCMOV-NEXT:    andl %edx, %ecx
+; X32-NOCMOV-NEXT:    xorl %eax, %ecx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    andl %edx, %eax
+; X32-NOCMOV-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    movl %eax, 28(%edx)
+; X32-NOCMOV-NEXT:    movl %ecx, 24(%edx)
+; X32-NOCMOV-NEXT:    movl %edi, 20(%edx)
+; X32-NOCMOV-NEXT:    movl %ebp, 16(%edx)
+; X32-NOCMOV-NEXT:    movl %ebx, 12(%edx)
+; X32-NOCMOV-NEXT:    movl %esi, 8(%edx)
+; X32-NOCMOV-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-NOCMOV-NEXT:    movl %eax, 4(%edx)
+; X32-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NOCMOV-NEXT:    movl %eax, (%edx)
+; X32-NOCMOV-NEXT:    movl %edx, %eax
+; X32-NOCMOV-NEXT:    addl $8, %esp
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 20
+; X32-NOCMOV-NEXT:    popl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT:    popl %edi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT:    popl %ebx
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    popl %ebp
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
+; X32-NOCMOV-NEXT:    retl $4
+  %result = call <8 x i32> @llvm.ct.select.v8i32(i1 %cond, <8 x i32> %a, <8 x i32> %b)
+  ret <8 x i32> %result
+}
+
+define float @test_ctselect_f32_nan_inf(i1 %cond) {
+; X64-LABEL: test_ctselect_f32_nan_inf:
+; X64:       # %bb.0:
+; X64-NEXT:    andl $1, %edi
+; X64-NEXT:    negl %edi
+; X64-NEXT:    andl $4194304, %edi # imm = 0x400000
+; X64-NEXT:    xorl $2139095040, %edi # imm = 0x7F800000
+; X64-NEXT:    movd %edi, %xmm0
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_f32_nan_inf:
+; X32:       # %bb.0:
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    negl %eax
+; X32-NEXT:    andl $4194304, %eax # imm = 0x400000
+; X32-NEXT:    xorl $2139095040, %eax # imm = 0x7F800000
+; X32-NEXT:    movl %eax, (%esp)
+; X32-NEXT:    flds (%esp)
+; X32-NEXT:    popl %eax
+; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    retl
+;
+; X32-NOCMOV-LABEL: test_ctselect_f32_nan_inf:
+; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    pushl %eax
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    andl $1, %eax
+; X32-NOCMOV-NEXT:    negl %eax
+; X32-NOCMOV-NEXT:    andl $4194304, %eax # imm = 0x400000
+; X32-NOCMOV-NEXT:    xorl $2139095040, %eax # imm = 0x7F800000
+; X32-NOCMOV-NEXT:    movl %eax, (%esp)
+; X32-NOCMOV-NEXT:    flds (%esp)
+; X32-NOCMOV-NEXT:    popl %eax
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
+; X32-NOCMOV-NEXT:    retl
+  %result = call float @llvm.ct.select.f32(i1 %cond, float 0x7FF8000000000000, float 0x7FF0000000000000)
+  ret float %result
+}
+
+define double @test_ctselect_f64_nan_inf(i1 %cond) {
+; X64-LABEL: test_ctselect_f64_nan_inf:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    andl $1, %edi
+; X64-NEXT:    negq %rdi
+; X64-NEXT:    movabsq $2251799813685248, %rax # imm = 0x8000000000000
+; X64-NEXT:    andq %rdi, %rax
+; X64-NEXT:    movabsq $9218868437227405312, %rcx # imm = 0x7FF0000000000000
+; X64-NEXT:    xorq %rax, %rcx
+; X64-NEXT:    movq %rcx, %xmm0
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_f64_nan_inf:
+; X32:       # %bb.0:
+; X32-NEXT:    subl $12, %esp
+; X32-NEXT:    .cfi_def_cfa_offset 16
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    negl %eax
+; X32-NEXT:    andl $524288, %eax # imm = 0x80000
+; X32-NEXT:    orl $2146435072, %eax # imm = 0x7FF00000
+; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, (%esp)
+; X32-NEXT:    fldl (%esp)
+; X32-NEXT:    addl $12, %esp
+; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    retl
+;
+; X32-NOCMOV-LABEL: test_ctselect_f64_nan_inf:
+; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    subl $12, %esp
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    andl $1, %eax
+; X32-NOCMOV-NEXT:    negl %eax
+; X32-NOCMOV-NEXT:    andl $524288, %eax # imm = 0x80000
+; X32-NOCMOV-NEXT:    orl $2146435072, %eax # imm = 0x7FF00000
+; X32-NOCMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT:    movl $0, (%esp)
+; X32-NOCMOV-NEXT:    fldl (%esp)
+; X32-NOCMOV-NEXT:    addl $12, %esp
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
+; X32-NOCMOV-NEXT:    retl
+  %result = call double @llvm.ct.select.f64(i1 %cond, double 0x7FF8000000000000, double 0x7FF0000000000000)
+  ret double %result
+}
+
 ; Declare the intrinsics
+declare i1 @llvm.ct.select.i1(i1, i1, i1)
 declare i8 @llvm.ct.select.i8(i1, i8, i8)
 declare i16 @llvm.ct.select.i16(i1, i16, i16)
 declare i32 @llvm.ct.select.i32(i1, i32, i32)
@@ -777,3 +1310,12 @@ declare i64 @llvm.ct.select.i64(i1, i64, i64)
 declare float @llvm.ct.select.f32(i1, float, float)
 declare double @llvm.ct.select.f64(i1, double, double)
 declare ptr @llvm.ct.select.p0(i1, ptr, ptr)
+
+; Vector intrinsics
+declare <4 x i32> @llvm.ct.select.v4i32(i1, <4 x i32>, <4 x i32>)
+declare <2 x i64> @llvm.ct.select.v2i64(i1, <2 x i64>, <2 x i64>)
+declare <8 x i16> @llvm.ct.select.v8i16(i1, <8 x i16>, <8 x i16>)
+declare <16 x i8> @llvm.ct.select.v16i8(i1, <16 x i8>, <16 x i8>)
+declare <4 x float> @llvm.ct.select.v4f32(i1, <4 x float>, <4 x float>)
+declare <2 x double> @llvm.ct.select.v2f64(i1, <2 x double>, <2 x double>)
+declare <8 x i32> @llvm.ct.select.v8i32(i1, <8 x i32>, <8 x i32>)

_______________________________________________
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

Reply via email to