[clang] [CIR] Upstream vec shuffle builtins in CIR codegen (PR #169178)

Thibault Monnier via cfe-commits Fri, 28 Nov 2025 22:47:48 -0800

https://github.com/Thibault-Monnier updated 
https://github.com/llvm/llvm-project/pull/169178


>From 8e108fb1f59fd91c412b075598fe06825a9d6f07 Mon Sep 17 00:00:00 2001
From: Thibault-Monnier <[email protected]>
Date: Tue, 25 Nov 2025 22:22:35 +0100
Subject: [PATCH 1/3] Upstream CIR Codegen for shuffle X86 builtins

---
 clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp    | 97 +++++++++++++++++--
 clang/lib/CIR/CodeGen/CIRGenFunction.h        | 24 ++++-
 clang/test/CIR/CodeGen/X86/avx-builtins.c     | 24 +++++
 clang/test/CIR/CodeGen/X86/avx2-builtins.c    | 53 ++++++++++
 .../test/CIR/CodeGen/X86/avx512bw-builtins.c  | 41 ++++++++
 clang/test/CIR/CodeGen/X86/avx512f-builtins.c | 24 +++++
 clang/test/CIR/CodeGen/X86/sse-builtins.c     | 12 +++
 clang/test/CIR/CodeGen/X86/sse2-builtins.c    | 43 +++++++-
 8 files changed, 308 insertions(+), 10 deletions(-)
 create mode 100644 clang/test/CIR/CodeGen/X86/avx2-builtins.c
 create mode 100644 clang/test/CIR/CodeGen/X86/avx512bw-builtins.c

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 978fee7dbec9d..98e61a5f5cb5f 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -68,6 +68,35 @@ static mlir::Value emitVectorFCmp(CIRGenBuilderTy &builder,
   return bitCast;
 }
 
+static cir::VecShuffleOp emitPshufW(CIRGenFunction &cgf,
+                                    CIRGenBuilderTy &builder,
+                                    llvm::SmallVector<mlir::Value> &ops,
+                                    const CallExpr *expr, const bool isLow) {
+  uint32_t imm = cgf.getZExtIntValueFromConstOp(ops[1]);
+
+  auto vecTy = cast<cir::VectorType>(ops[0].getType());
+  unsigned numElts = vecTy.getSize();
+
+  unsigned firstHalfStart = isLow ? 0 : 4;
+  unsigned secondHalfStart = 4 - firstHalfStart;
+
+  // Splat the 8-bits of immediate 4 times to help the loop wrap around.
+  imm = (imm & 0xff) * 0x01010101;
+
+  int64_t indices[32];
+  for (unsigned l = 0; l != numElts; l += 8) {
+    for (unsigned i = firstHalfStart; i != firstHalfStart + 4; ++i) {
+      indices[l + i] = l + (imm & 3) + firstHalfStart;
+      imm /= 4;
+    }
+    for (unsigned i = secondHalfStart; i != secondHalfStart + 4; ++i)
+      indices[l + i] = l + i;
+  }
+
+  return builder.createVecShuffle(cgf.getLoc(expr->getExprLoc()), ops[0],
+                                  ArrayRef(indices, numElts));
+}
+
 mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
                                                const CallExpr *expr) {
   if (builtinID == Builtin::BI__builtin_cpu_is) {
@@ -163,9 +192,7 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI__builtin_ia32_vec_ext_v4di: {
     unsigned numElts = cast<cir::VectorType>(ops[0].getType()).getSize();
 
-    uint64_t index =
-        ops[1].getDefiningOp<cir::ConstantOp>().getIntValue().getZExtValue();
-
+    uint64_t index = getZExtIntValueFromConstOp(ops[1]);
     index &= numElts - 1;
 
     cir::ConstantOp indexVal =
@@ -523,12 +550,20 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI__builtin_ia32_pblendw256:
   case X86::BI__builtin_ia32_pblendd128:
   case X86::BI__builtin_ia32_pblendd256:
+  cgm.errorNYI(expr->getSourceRange(),
+           std::string("unimplemented X86 builtin call: ") +
+               getContext().BuiltinInfo.getName(builtinID));
+  return {};
   case X86::BI__builtin_ia32_pshuflw:
   case X86::BI__builtin_ia32_pshuflw256:
-  case X86::BI__builtin_ia32_pshuflw512:
+  case X86::BI__builtin_ia32_pshuflw512: {
+    return emitPshufW(*this, builder, ops, expr, true);
+  }
   case X86::BI__builtin_ia32_pshufhw:
   case X86::BI__builtin_ia32_pshufhw256:
-  case X86::BI__builtin_ia32_pshufhw512:
+  case X86::BI__builtin_ia32_pshufhw512: {
+    return emitPshufW(*this, builder, ops, expr, false);
+  }
   case X86::BI__builtin_ia32_pshufd:
   case X86::BI__builtin_ia32_pshufd256:
   case X86::BI__builtin_ia32_pshufd512:
@@ -537,13 +572,61 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI__builtin_ia32_vpermilpd256:
   case X86::BI__builtin_ia32_vpermilps256:
   case X86::BI__builtin_ia32_vpermilpd512:
-  case X86::BI__builtin_ia32_vpermilps512:
+  case X86::BI__builtin_ia32_vpermilps512: {
+    // TODO: Add tests for this branch.
+    uint32_t imm = getSExtIntValueFromConstOp(ops[1]);
+
+    auto vecTy = cast<cir::VectorType>(ops[0].getType());
+    unsigned numElts = vecTy.getSize();
+    auto eltTy = vecTy.getElementType();
+
+    unsigned eltBitWidth = getTypeSizeInBits(eltTy).getFixedValue();
+    unsigned numLaneElts = 128 / eltBitWidth;
+
+    // Splat the 8-bits of immediate 4 times to help the loop wrap around.
+    imm = (imm & 0xff) * 0x01010101;
+
+    llvm::SmallVector<int64_t, 16> indices;
+    for (unsigned l = 0; l != numElts; l += numLaneElts) {
+      for (unsigned i = 0; i != numLaneElts; ++i) {
+        indices.push_back((imm % numLaneElts) + l);
+        imm /= numLaneElts;
+      }
+    }
+
+    return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0],
+                                    indices);
+  }
   case X86::BI__builtin_ia32_shufpd:
   case X86::BI__builtin_ia32_shufpd256:
   case X86::BI__builtin_ia32_shufpd512:
   case X86::BI__builtin_ia32_shufps:
   case X86::BI__builtin_ia32_shufps256:
-  case X86::BI__builtin_ia32_shufps512:
+  case X86::BI__builtin_ia32_shufps512: {
+    uint32_t imm = getZExtIntValueFromConstOp(ops[2]);
+
+    auto vecTy = cast<cir::VectorType>(ops[0].getType());
+    unsigned numElts = vecTy.getSize();
+    unsigned numLanes = cgm.getDataLayout().getTypeSizeInBits(vecTy) / 128;
+    unsigned numLaneElts = numElts / numLanes;
+
+    // Splat the 8-bits of immediate 4 times to help the loop wrap around.
+    imm = (imm & 0xff) * 0x01010101;
+
+    int64_t indices[16];
+    for (unsigned l = 0; l != numElts; l += numLaneElts) {
+      for (unsigned i = 0; i != numLaneElts; ++i) {
+        uint32_t idx = imm % numLaneElts;
+        imm /= numLaneElts;
+        if (i >= (numLaneElts / 2))
+          idx += numElts;
+        indices[l + i] = l + idx;
+      }
+    }
+
+    return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0], ops[1],
+                                    ArrayRef(indices, numElts));
+  }
   case X86::BI__builtin_ia32_permdi256:
   case X86::BI__builtin_ia32_permdf256:
   case X86::BI__builtin_ia32_permdi512:
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index b426f3389ff1b..70c030ba8e852 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -1349,6 +1349,28 @@ class CIRGenFunction : public CIRGenTypeCache {
                                     cir::IntType resType, mlir::Value emittedE,
                                     bool isDynamic);
 
+  /// Get integer from a mlir::Value that is an int constant or a constant op.
+  static int64_t getSExtIntValueFromConstOp(mlir::Value val) {
+    auto constOp = val.getDefiningOp<cir::ConstantOp>();
+    assert(constOp && "getIntValueFromConstOp call with non ConstantOp");
+    return constOp.getIntValue().getSExtValue();
+  }
+
+  /// Get zero-extended integer from a mlir::Value that is an int constant or a
+  /// constant op.
+  static int64_t getZExtIntValueFromConstOp(mlir::Value val) {
+    auto constOp = val.getDefiningOp<cir::ConstantOp>();
+    assert(constOp &&
+           "getZeroExtendedIntValueFromConstOp call with non ConstantOp");
+    return constOp.getIntValue().getZExtValue();
+  }
+
+  /// Get size of type in bits using SizedTypeInterface
+  llvm::TypeSize getTypeSizeInBits(mlir::Type ty) const {
+    assert(cir::isSized(ty) && "Type must implement SizedTypeInterface");
+    return cgm.getDataLayout().getTypeSizeInBits(ty);
+  }
+
   mlir::Value evaluateOrEmitBuiltinObjectSize(const clang::Expr *e,
                                               unsigned type,
                                               cir::IntType resType,
@@ -1804,7 +1826,7 @@ class CIRGenFunction : public CIRGenTypeCache {
 
   mlir::LogicalResult emitWhileStmt(const clang::WhileStmt &s);
 
-  mlir::Value emitX86BuiltinExpr(unsigned builtinID, const CallExpr *e);
+  mlir::Value emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr);
 
   /// Given an assignment `*lhs = rhs`, emit a test that checks if \p rhs is
   /// nonnull, if 1\p LHS is marked _Nonnull.
diff --git a/clang/test/CIR/CodeGen/X86/avx-builtins.c b/clang/test/CIR/CodeGen/X86/avx-builtins.c
index 82fa4358dc400..d9d1f3fc2b279 100644
--- a/clang/test/CIR/CodeGen/X86/avx-builtins.c
+++ b/clang/test/CIR/CodeGen/X86/avx-builtins.c
@@ -73,4 +73,28 @@ __m256i test_mm256_undefined_si256(void) {
   // OGCG-LABEL: test_mm256_undefined_si256
   // OGCG: ret <4 x i64> zeroinitializer
   return _mm256_undefined_si256();
+}
+
+__m256d test_mm256_shuffle_pd(__m256d A, __m256d B) {
+  // CIR-LABEL: test_mm256_shuffle_pd
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x 
!cir.double>) [#cir.int<0> : !s32i, #cir.int<4> : !s32i, #cir.int<2> : !s32i, 
#cir.int<6> : !s32i] : !cir.vector<4 x !cir.double>
+
+  // CHECK-LABEL: test_mm256_shuffle_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x 
i32> <i32 0, i32 4, i32 2, i32 6>
+
+  // OGCG-LABEL: test_mm256_shuffle_pd
+  // OGCG: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> 
<i32 0, i32 4, i32 2, i32 6>
+  return _mm256_shuffle_pd(A, B, 0);
+}
+
+__m256 test_mm256_shuffle_ps(__m256 A, __m256 B) {
+  // CIR-LABEL: test_mm256_shuffle_ps
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x 
!cir.float>) [#cir.int<0> : !s32i, #cir.int<0> : !s32i, #cir.int<8> : !s32i, 
#cir.int<8> : !s32i, #cir.int<4> : !s32i, #cir.int<4> : !s32i, #cir.int<12> : 
!s32i, #cir.int<12> : !s32i] : !cir.vector<8 x !cir.float>
+
+  // CHECK-LABEL: test_mm256_shuffle_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> 
<i32 0, i32 0, i32 8, i32 8, i32 4, i32 4, i32 12, i32 12>
+
+  // OGCG-LABEL: test_mm256_shuffle_ps
+  // OGCG: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> 
<i32 0, i32 0, i32 8, i32 8, i32 4, i32 4, i32 12, i32 12>
+  return _mm256_shuffle_ps(A, B, 0);
 }
\ No newline at end of file
diff --git a/clang/test/CIR/CodeGen/X86/avx2-builtins.c b/clang/test/CIR/CodeGen/X86/avx2-builtins.c
new file mode 100644
index 0000000000000..b7497c2053b2d
--- /dev/null
+++ b/clang/test/CIR/CodeGen/X86/avx2-builtins.c
@@ -0,0 +1,53 @@
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +avx2 -fclangir -emit-cir -o 
%t.cir -Wall -Werror
+// RUN: FileCheck --check-prefixes=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +avx2 -fno-signed-char -fclangir 
-emit-cir -o %t.cir -Wall -Werror
+// RUN: FileCheck --check-prefixes=CIR --input-file=%t.cir %s
+
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +avx2 -fclangir -emit-llvm -o 
%t.ll -Wall -Werror
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +avx2 -fno-signed-char -fclangir 
-emit-llvm -o %t.ll -Wall -Werror
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +avx2 -fclangir -emit-cir -o 
%t.cir -Wall -Werror
+// RUN: FileCheck --check-prefixes=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +avx2 -fno-signed-char -fclangir 
-emit-cir -o %t.cir -Wall -Werror
+// RUN: FileCheck --check-prefixes=CIR --input-file=%t.cir %s
+
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +avx2 -fclangir -emit-llvm -o 
%t.ll -Wall -Werror
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +avx2 -fno-signed-char -fclangir 
-emit-llvm -o %t.ll -Wall -Werror
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +avx2 -emit-llvm -o - -Wall -Werror 
| FileCheck %s --check-prefixes=OGCG
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +avx2 -fno-signed-char -emit-llvm 
-o - -Wall -Werror | FileCheck %s --check-prefixes=OGCG
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +avx2 -emit-llvm -o - -Wall -Werror 
| FileCheck %s --check-prefixes=OGCG
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +avx2 -fno-signed-char -emit-llvm 
-o - -Wall -Werror | FileCheck %s --check-prefixes=OGCG
+
+// This test mimics clang/test/CodeGen/X86/avx2-builtins.c, which eventually
+// CIR shall be able to support fully.
+
+#include <immintrin.h>
+
+__m256i test_mm256_shufflelo_epi16(__m256i a) {
+  // CIR-LABEL: _mm256_shufflelo_epi16
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x 
!s16i>) [#cir.int<3> : !s32i, #cir.int<0> : !s32i, #cir.int<1> : !s32i, 
#cir.int<1> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : 
!s32i, #cir.int<7> : !s32i, #cir.int<11> : !s32i, #cir.int<8> : !s32i, 
#cir.int<9> : !s32i, #cir.int<9> : !s32i, #cir.int<12> : !s32i, #cir.int<13> : 
!s32i, #cir.int<14> : !s32i, #cir.int<15> : !s32i] : !cir.vector<16 x !s16i>
+
+  // LLVM-LABEL: test_mm256_shufflelo_epi16
+  // LLVM: shufflevector <16 x i16> %{{.*}}, <16 x i16> poison, <16 x i32> 
<i32 3, i32 0, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 9, 
i32 9, i32 12, i32 13, i32 14, i32 15>
+
+  // OGCG-LABEL: test_mm256_shufflelo_epi16
+  // OGCG: shufflevector <16 x i16> %{{.*}}, <16 x i16> poison, <16 x i32> 
<i32 3, i32 0, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 9, 
i32 9, i32 12, i32 13, i32 14, i32 15>
+  return _mm256_shufflelo_epi16(a, 83);
+}
+
+__m256i test_mm256_shufflehi_epi16(__m256i a) {
+  // CIR-LABEL: _mm256_shufflehi_epi16
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x 
!s16i>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, 
#cir.int<3> : !s32i, #cir.int<7> : !s32i, #cir.int<6> : !s32i, #cir.int<6> : 
!s32i, #cir.int<5> : !s32i, #cir.int<8> : !s32i, #cir.int<9> : !s32i, 
#cir.int<10> : !s32i, #cir.int<11> : !s32i, #cir.int<15> : !s32i, #cir.int<14> 
: !s32i, #cir.int<14> : !s32i, #cir.int<13> : !s32i] : !cir.vector<16 x !s16i>
+
+  // LLVM-LABEL: test_mm256_shufflehi_epi16
+  // LLVM: shufflevector <16 x i16> %{{.*}}, <16 x i16> poison, <16 x i32> 
<i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 6, i32 5, i32 8, i32 9, i32 10, 
i32 11, i32 15, i32 14, i32 14, i32 13>
+
+  // OGCG-LABEL: test_mm256_shufflehi_epi16
+  // OGCG: shufflevector <16 x i16> %{{.*}}, <16 x i16> poison, <16 x i32> 
<i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 6, i32 5, i32 8, i32 9, i32 10, 
i32 11, i32 15, i32 14, i32 14, i32 13>
+  return _mm256_shufflehi_epi16(a, 107);
+}
diff --git a/clang/test/CIR/CodeGen/X86/avx512bw-builtins.c b/clang/test/CIR/CodeGen/X86/avx512bw-builtins.c
new file mode 100644
index 0000000000000..db7a76da06ad7
--- /dev/null
+++ b/clang/test/CIR/CodeGen/X86/avx512bw-builtins.c
@@ -0,0 +1,41 @@
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +avx512bw -fclangir -emit-cir -o 
%t.cir -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +avx512bw -fno-signed-char  
-fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +avx512bw  -fclangir -emit-llvm -o 
%t.ll -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux  -target-feature +avx512bw -fno-signed-char  
-fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +avx512bw -emit-llvm -o - -Wall 
-Werror -Wsign-conversion | FileCheck %s --check-prefix=OGCG
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +avx512bw -fno-signed-char 
-emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s 
--check-prefix=OGCG
+
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +avx512bw -emit-llvm -o - -Wall 
-Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +avx512bw -fno-signed-char 
-emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s 
--check-prefixes=OGCG
+
+#include <immintrin.h>
+
+__m512i test_mm512_shufflelo_epi16(__m512i __A) {
+  // CIR-LABEL: _mm512_shufflelo_epi16
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<32 x 
!s16i>) [#cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<0> : !s32i, 
#cir.int<0> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : 
!s32i, #cir.int<7> : !s32i, #cir.int<9> : !s32i, #cir.int<9> : !s32i, 
#cir.int<8> : !s32i, #cir.int<8> : !s32i, #cir.int<12> : !s32i, #cir.int<13> : 
!s32i, #cir.int<14> : !s32i, #cir.int<15> : !s32i, #cir.int<17> : !s32i, 
#cir.int<17> : !s32i, #cir.int<16> : !s32i, #cir.int<16> : !s32i, #cir.int<20> 
: !s32i, #cir.int<21> : !s32i, #cir.int<22> : !s32i, #cir.int<23> : !s32i, 
#cir.int<25> : !s32i, #cir.int<25> : !s32i, #cir.int<24> : !s32i, #cir.int<24> 
: !s32i, #cir.int<28> : !s32i, #cir.int<29> : !s32i, #cir.int<30> : !s32i, 
#cir.int<31> : !s32i] : !cir.vector<32 x !s16i>
+
+  // LLVM-LABEL: @test_mm512_shufflelo_epi16
+  // LLVM: shufflevector <32 x i16> %{{.*}}, <32 x i16> poison, <32 x i32> 
<i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 8, 
i32 8, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 16, i32 16, i32 20, 
i32 21, i32 22, i32 23, i32 25, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, 
i32 31>
+
+  // OGCG-LABEL: @test_mm512_shufflelo_epi16
+  // OGCG: shufflevector <32 x i16> %{{.*}}, <32 x i16> poison, <32 x i32> 
<i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 8, 
i32 8, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 16, i32 16, i32 20, 
i32 21, i32 22, i32 23, i32 25, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, 
i32 31>
+  return _mm512_shufflelo_epi16(__A, 5);
+}
+
+__m512i test_mm512_shufflehi_epi16(__m512i __A) {
+  // CIR-LABEL: _mm512_shufflehi_epi16
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<32 x 
!s16i>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, 
#cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<5> : !s32i, #cir.int<4> : 
!s32i, #cir.int<4> : !s32i, #cir.int<8> : !s32i, #cir.int<9> : !s32i, 
#cir.int<10> : !s32i, #cir.int<11> : !s32i, #cir.int<13> : !s32i, #cir.int<13> 
: !s32i, #cir.int<12> : !s32i, #cir.int<12> : !s32i, #cir.int<16> : !s32i, 
#cir.int<17> : !s32i, #cir.int<18> : !s32i, #cir.int<19> : !s32i, #cir.int<21> 
: !s32i, #cir.int<21> : !s32i, #cir.int<20> : !s32i, #cir.int<20> : !s32i, 
#cir.int<24> : !s32i, #cir.int<25> : !s32i, #cir.int<26> : !s32i, #cir.int<27> 
: !s32i, #cir.int<29> : !s32i, #cir.int<29> : !s32i, #cir.int<28> : !s32i, 
#cir.int<28> : !s32i] : !cir.vector<32 x !s16i>
+
+  // LLVM-LABEL: @test_mm512_shufflehi_epi16
+  // LLVM: shufflevector <32 x i16> %{{.*}}, <32 x i16> poison, <32 x i32> 
<i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4, i32 8, i32 9, i32 10, 
i32 11, i32 13, i32 13, i32 12, i32 12, i32 16, i32 17, i32 18, i32 19, i32 21, 
i32 21, i32 20, i32 20, i32 24, i32 25, i32 26, i32 27, i32 29, i32 29, i32 28, 
i32 28>
+
+  // OGCG-LABEL: @test_mm512_shufflehi_epi16
+  // OGCG: shufflevector <32 x i16> %{{.*}}, <32 x i16> poison, <32 x i32> 
<i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4, i32 8, i32 9, i32 10, 
i32 11, i32 13, i32 13, i32 12, i32 12, i32 16, i32 17, i32 18, i32 19, i32 21, 
i32 21, i32 20, i32 20, i32 24, i32 25, i32 26, i32 27, i32 29, i32 29, i32 28, 
i32 28>
+  return _mm512_shufflehi_epi16(__A, 5);
+}
diff --git a/clang/test/CIR/CodeGen/X86/avx512f-builtins.c b/clang/test/CIR/CodeGen/X86/avx512f-builtins.c
index dc54a87856a7c..bac01671155f5 100644
--- a/clang/test/CIR/CodeGen/X86/avx512f-builtins.c
+++ b/clang/test/CIR/CodeGen/X86/avx512f-builtins.c
@@ -77,3 +77,27 @@ __m512i test_mm512_undefined_epi32(void) {
   // OGCG: ret <8 x i64> zeroinitializer
   return _mm512_undefined_epi32();
 }
+
+__m512d test_mm512_shuffle_pd(__m512d __M, __m512d __V) {
+  // CIR-LABEL: test_mm512_shuffle_pd
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x 
!cir.double>) [#cir.int<0> : !s32i, #cir.int<8> : !s32i, #cir.int<3> : !s32i, 
#cir.int<10> : !s32i, #cir.int<4> : !s32i, #cir.int<12> : !s32i, #cir.int<6> : 
!s32i, #cir.int<14> : !s32i] : !cir.vector<8 x !cir.double>
+
+  // LLVM-LABEL: test_mm512_shuffle_pd
+  // LLVM: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> 
<i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
+
+  // OGCG-LABEL: test_mm512_shuffle_pd
+  // OGCG: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> 
<i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
+  return _mm512_shuffle_pd(__M, __V, 4);
+}
+
+__m512 test_mm512_shuffle_ps(__m512 __M, __m512 __V) {
+  // CIR-LABEL: test_mm512_shuffle_ps
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x 
!cir.float>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<16> : !s32i, 
#cir.int<16> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<20> : 
!s32i, #cir.int<20> : !s32i, #cir.int<8> : !s32i, #cir.int<9> : !s32i, 
#cir.int<24> : !s32i, #cir.int<24> : !s32i, #cir.int<12> : !s32i, #cir.int<13> 
: !s32i, #cir.int<28> : !s32i, #cir.int<28> : !s32i] : !cir.vector<16 x 
!cir.float>
+
+  // LLVM-LABEL: test_mm512_shuffle_ps
+  // LLVM: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x 
i32> <i32 0, i32 1, i32 16, i32 16, i32 4, i32 5, i32 20, i32 20, i32 8, i32 9, 
i32 24, i32 24, i32 12, i32 13, i32 28, i32 28>
+
+  // OGCG-LABEL: test_mm512_shuffle_ps
+  // OGCG: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x 
i32> <i32 0, i32 1, i32 16, i32 16, i32 4, i32 5, i32 20, i32 20, i32 8, i32 9, 
i32 24, i32 24, i32 12, i32 13, i32 28, i32 28>
+  return _mm512_shuffle_ps(__M, __V, 4);
+}
diff --git a/clang/test/CIR/CodeGen/X86/sse-builtins.c b/clang/test/CIR/CodeGen/X86/sse-builtins.c
index c893859b297cc..a2a5b1849d727 100644
--- a/clang/test/CIR/CodeGen/X86/sse-builtins.c
+++ b/clang/test/CIR/CodeGen/X86/sse-builtins.c
@@ -71,3 +71,15 @@ __m128 test_mm_undefined_ps(void) {
   // OGCG: ret <4 x float> zeroinitializer
   return _mm_undefined_ps();
 }
+
+__m128 test_mm_shuffle_ps(__m128 A, __m128 B) {
+  // CIR-LABEL: _mm_shuffle_ps
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x 
!cir.float>) [#cir.int<0> : !s32i, #cir.int<0> : !s32i, #cir.int<4> : !s32i, 
#cir.int<4> : !s32i] : !cir.vector<4 x !cir.float>
+
+  // CHECK-LABEL: test_mm_shuffle_ps
+  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> 
<i32 0, i32 0, i32 4, i32 4>
+
+  // OGCG-LABEL: test_mm_shuffle_ps
+  // OGCG: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> 
<i32 0, i32 0, i32 4, i32 4>
+  return _mm_shuffle_ps(A, B, 0);
+}
diff --git a/clang/test/CIR/CodeGen/X86/sse2-builtins.c b/clang/test/CIR/CodeGen/X86/sse2-builtins.c
index f5e07cdc28ccd..31a297bd3cb52 100644
--- a/clang/test/CIR/CodeGen/X86/sse2-builtins.c
+++ b/clang/test/CIR/CodeGen/X86/sse2-builtins.c
@@ -8,8 +8,11 @@
 // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +sse2 -fno-signed-char -fclangir 
-emit-llvm -o %t.ll -Wall -Werror
 // RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
 
-// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +sse -emit-llvm -o - -Wall -Werror 
| FileCheck %s -check-prefix=OGCG
-// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +sse -emit-llvm -o - -Wall -Werror 
| FileCheck %s -check-prefix=OGCG
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +sse2 -emit-llvm -o - -Wall -Werror 
| FileCheck %s --check-prefixes=OGCG
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +sse2 -fno-signed-char -emit-llvm 
-o - -Wall -Werror | FileCheck %s --check-prefixes=OGCG
+
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +sse2 -emit-llvm -o - -Wall -Werror 
| FileCheck %s --check-prefixes=OGCG
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +sse2 -fno-signed-char -emit-llvm 
-o - -Wall -Werror | FileCheck %s --check-prefixes=OGCG
 
 // This test mimics clang/test/CodeGen/X86/sse2-builtins.c, which eventually
 // CIR shall be able to support fully.
@@ -108,3 +111,39 @@ void test_mm_pause(void) {
   // LLVM: call void @llvm.x86.sse2.pause()
   // OGCG: call void @llvm.x86.sse2.pause()
 }
+
+__m128i test_mm_shufflelo_epi16(__m128i A) {
+  // CIR-LABEL: _mm_shufflelo_epi16
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !s16i>) 
[#cir.int<0> : !s32i, #cir.int<0> : !s32i, #cir.int<0> : !s32i, #cir.int<0> : 
!s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : !s32i, 
#cir.int<7> : !s32i] : !cir.vector<8 x !s16i>
+
+  // LLVM-LABEL: test_mm_shufflelo_epi16
+  // LLVM: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> <i32 
0, i32 0, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
+
+  // OGCG-LABEL: test_mm_shufflelo_epi16
+  // OGCG: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> <i32 
0, i32 0, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
+  return _mm_shufflelo_epi16(A, 0);
+}
+
+__m128i test_mm_shufflehi_epi16(__m128i A) {
+  // CIR-LABEL: _mm_shufflehi_epi16
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !s16i>) 
[#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : 
!s32i, #cir.int<4> : !s32i, #cir.int<4> : !s32i, #cir.int<4> : !s32i, 
#cir.int<4> : !s32i] : !cir.vector<8 x !s16i>
+
+  // LLVM-LABEL: test_mm_shufflehi_epi16
+  // LLVM: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> <i32 
0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
+
+  // OGCG-LABEL: test_mm_shufflehi_epi16
+  // OGCG: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> <i32 
0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
+  return _mm_shufflehi_epi16(A, 0);
+}
+
+__m128d test_mm_shuffle_pd(__m128d A, __m128d B) {
+  // CIR-LABEL: test_mm_shuffle_pd
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x 
!cir.double>) [#cir.int<1> : !s32i, #cir.int<2> : !s32i] : !cir.vector<2 x 
!cir.double>
+
+  // CHECK-LABEL: test_mm_shuffle_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x 
i32> <i32 1, i32 2>
+
+  // OGCG-LABEL: test_mm_shuffle_pd
+  // OGCG: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> 
<i32 1, i32 2>
+  return _mm_shuffle_pd(A, B, 1);
+}
\ No newline at end of file

>From d552a3b832b1076df33523e3c7a94abd01caa4c7 Mon Sep 17 00:00:00 2001
From: Thibault-Monnier <[email protected]>
Date: Tue, 25 Nov 2025 22:29:13 +0100
Subject: [PATCH 2/3] Address reviews + upstream tests for pshufd

---
 clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp    | 100 +++++++---------
 clang/lib/CIR/CodeGen/CIRGenFunction.h        |  38 +++---
 clang/test/CIR/CodeGen/X86/avx-builtins.c     |   2 +-
 .../CIR/CodeGen/X86/builtin-x86-pshufd.cpp    | 113 ++++++++++++++++++
 clang/test/CIR/CodeGen/X86/sse2-builtins.c    |   2 +-
 5 files changed, 177 insertions(+), 78 deletions(-)
 create mode 100644 clang/test/CIR/CodeGen/X86/builtin-x86-pshufd.cpp

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 98e61a5f5cb5f..e50957809e85d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -68,13 +68,15 @@ static mlir::Value emitVectorFCmp(CIRGenBuilderTy &builder,
   return bitCast;
 }
 
+//
 static cir::VecShuffleOp emitPshufW(CIRGenFunction &cgf,
                                     CIRGenBuilderTy &builder,
-                                    llvm::SmallVector<mlir::Value> &ops,
+                                    const mlir::Value vec,
+                                    const mlir::Value immediate,
                                     const CallExpr *expr, const bool isLow) {
-  uint32_t imm = cgf.getZExtIntValueFromConstOp(ops[1]);
+  uint32_t imm = cgf.getZExtIntValueFromConstOp(immediate);
 
-  auto vecTy = cast<cir::VectorType>(ops[0].getType());
+  auto vecTy = cast<cir::VectorType>(vec.getType());
   unsigned numElts = vecTy.getSize();
 
   unsigned firstHalfStart = isLow ? 0 : 4;
@@ -87,16 +89,41 @@ static cir::VecShuffleOp emitPshufW(CIRGenFunction &cgf,
   for (unsigned l = 0; l != numElts; l += 8) {
     for (unsigned i = firstHalfStart; i != firstHalfStart + 4; ++i) {
       indices[l + i] = l + (imm & 3) + firstHalfStart;
-      imm /= 4;
+      imm >>= 2;
     }
     for (unsigned i = secondHalfStart; i != secondHalfStart + 4; ++i)
       indices[l + i] = l + i;
   }
 
-  return builder.createVecShuffle(cgf.getLoc(expr->getExprLoc()), ops[0],
+  return builder.createVecShuffle(cgf.getLoc(expr->getExprLoc()), vec,
                                   ArrayRef(indices, numElts));
 }
 
+static llvm::SmallVector<int64_t, 16>
+computeMaskPshufDOrShufP(CIRGenFunction &cgf, const mlir::Value vec,
+                         uint32_t imm, const bool isShufP) {
+  auto vecTy = cast<cir::VectorType>(vec.getType());
+  unsigned numElts = vecTy.getSize();
+  unsigned numLanes = cgf.cgm.getDataLayout().getTypeSizeInBits(vecTy) / 128;
+  unsigned numLaneElts = numElts / numLanes;
+
+  // Splat the 8-bits of immediate 4 times to help the loop wrap around.
+  imm = (imm & 0xff) * 0x01010101;
+
+  llvm::SmallVector<int64_t, 16> indices(numElts);
+  for (unsigned l = 0; l != numElts; l += numLaneElts) {
+    for (unsigned i = 0; i != numLaneElts; ++i) {
+      uint32_t idx = imm % numLaneElts;
+      imm /= numLaneElts;
+      if (isShufP && i >= (numLaneElts / 2))
+        idx += numElts;
+      indices[l + i] = l + idx;
+    }
+  }
+
+  return indices;
+}
+
 mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
                                                const CallExpr *expr) {
   if (builtinID == Builtin::BI__builtin_cpu_is) {
@@ -550,19 +577,19 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned 
builtinID,
   case X86::BI__builtin_ia32_pblendw256:
   case X86::BI__builtin_ia32_pblendd128:
   case X86::BI__builtin_ia32_pblendd256:
-  cgm.errorNYI(expr->getSourceRange(),
-           std::string("unimplemented X86 builtin call: ") +
-               getContext().BuiltinInfo.getName(builtinID));
-  return {};
+    cgm.errorNYI(expr->getSourceRange(),
+                 std::string("unimplemented X86 builtin call: ") +
+                     getContext().BuiltinInfo.getName(builtinID));
+    return {};
   case X86::BI__builtin_ia32_pshuflw:
   case X86::BI__builtin_ia32_pshuflw256:
   case X86::BI__builtin_ia32_pshuflw512: {
-    return emitPshufW(*this, builder, ops, expr, true);
+    return emitPshufW(*this, builder, ops[0], ops[1], expr, true);
   }
   case X86::BI__builtin_ia32_pshufhw:
   case X86::BI__builtin_ia32_pshufhw256:
   case X86::BI__builtin_ia32_pshufhw512: {
-    return emitPshufW(*this, builder, ops, expr, false);
+    return emitPshufW(*this, builder, ops[0], ops[1], expr, false);
   }
   case X86::BI__builtin_ia32_pshufd:
   case X86::BI__builtin_ia32_pshufd256:
@@ -573,29 +600,11 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned 
builtinID,
   case X86::BI__builtin_ia32_vpermilps256:
   case X86::BI__builtin_ia32_vpermilpd512:
   case X86::BI__builtin_ia32_vpermilps512: {
-    // TODO: Add tests for this branch.
-    uint32_t imm = getSExtIntValueFromConstOp(ops[1]);
-
-    auto vecTy = cast<cir::VectorType>(ops[0].getType());
-    unsigned numElts = vecTy.getSize();
-    auto eltTy = vecTy.getElementType();
-
-    unsigned eltBitWidth = getTypeSizeInBits(eltTy).getFixedValue();
-    unsigned numLaneElts = 128 / eltBitWidth;
-
-    // Splat the 8-bits of immediate 4 times to help the loop wrap around.
-    imm = (imm & 0xff) * 0x01010101;
+    const uint32_t imm = getSExtIntValueFromConstOp(ops[1]);
+    const llvm::SmallVector<int64_t, 16> mask =
+        computeMaskPshufDOrShufP(*this, ops[0], imm, false);
 
-    llvm::SmallVector<int64_t, 16> indices;
-    for (unsigned l = 0; l != numElts; l += numLaneElts) {
-      for (unsigned i = 0; i != numLaneElts; ++i) {
-        indices.push_back((imm % numLaneElts) + l);
-        imm /= numLaneElts;
-      }
-    }
-
-    return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0],
-                                    indices);
+    return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0], mask);
   }
   case X86::BI__builtin_ia32_shufpd:
   case X86::BI__builtin_ia32_shufpd256:
@@ -603,29 +612,12 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned 
builtinID,
   case X86::BI__builtin_ia32_shufps:
   case X86::BI__builtin_ia32_shufps256:
   case X86::BI__builtin_ia32_shufps512: {
-    uint32_t imm = getZExtIntValueFromConstOp(ops[2]);
-
-    auto vecTy = cast<cir::VectorType>(ops[0].getType());
-    unsigned numElts = vecTy.getSize();
-    unsigned numLanes = cgm.getDataLayout().getTypeSizeInBits(vecTy) / 128;
-    unsigned numLaneElts = numElts / numLanes;
-
-    // Splat the 8-bits of immediate 4 times to help the loop wrap around.
-    imm = (imm & 0xff) * 0x01010101;
-
-    int64_t indices[16];
-    for (unsigned l = 0; l != numElts; l += numLaneElts) {
-      for (unsigned i = 0; i != numLaneElts; ++i) {
-        uint32_t idx = imm % numLaneElts;
-        imm /= numLaneElts;
-        if (i >= (numLaneElts / 2))
-          idx += numElts;
-        indices[l + i] = l + idx;
-      }
-    }
+    const uint32_t imm = getZExtIntValueFromConstOp(ops[2]);
+    const llvm::SmallVector<int64_t, 16> mask =
+        computeMaskPshufDOrShufP(*this, ops[0], imm, true);
 
     return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0], ops[1],
-                                    ArrayRef(indices, numElts));
+                                    mask);
   }
   case X86::BI__builtin_ia32_permdi256:
   case X86::BI__builtin_ia32_permdf256:
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h 
b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index 70c030ba8e852..23a4d29dc99d8 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -202,6 +202,22 @@ class CIRGenFunction : public CIRGenTypeCache {
     return convertType(getContext().getTypeDeclType(t));
   }
 
+  /// Get integer from a mlir::Value that is an int constant or a constant op.
+  static int64_t getSExtIntValueFromConstOp(mlir::Value val) {
+    auto constOp = val.getDefiningOp<cir::ConstantOp>();
+    assert(constOp && "getIntValueFromConstOp call with non ConstantOp");
+    return constOp.getIntValue().getSExtValue();
+  }
+
+  /// Get zero-extended integer from a mlir::Value that is an int constant or a
+  /// constant op.
+  static int64_t getZExtIntValueFromConstOp(mlir::Value val) {
+    auto constOp = val.getDefiningOp<cir::ConstantOp>();
+    assert(constOp &&
+           "getZeroExtendedIntValueFromConstOp call with non ConstantOp");
+    return constOp.getIntValue().getZExtValue();
+  }
+
   ///  Return the cir::TypeEvaluationKind of QualType \c type.
   static cir::TypeEvaluationKind getEvaluationKind(clang::QualType type);
 
@@ -1349,28 +1365,6 @@ class CIRGenFunction : public CIRGenTypeCache {
                                     cir::IntType resType, mlir::Value emittedE,
                                     bool isDynamic);
 
-  /// Get integer from a mlir::Value that is an int constant or a constant op.
-  static int64_t getSExtIntValueFromConstOp(mlir::Value val) {
-    auto constOp = val.getDefiningOp<cir::ConstantOp>();
-    assert(constOp && "getIntValueFromConstOp call with non ConstantOp");
-    return constOp.getIntValue().getSExtValue();
-  }
-
-  /// Get zero-extended integer from a mlir::Value that is an int constant or a
-  /// constant op.
-  static int64_t getZExtIntValueFromConstOp(mlir::Value val) {
-    auto constOp = val.getDefiningOp<cir::ConstantOp>();
-    assert(constOp &&
-           "getZeroExtendedIntValueFromConstOp call with non ConstantOp");
-    return constOp.getIntValue().getZExtValue();
-  }
-
-  /// Get size of type in bits using SizedTypeInterface
-  llvm::TypeSize getTypeSizeInBits(mlir::Type ty) const {
-    assert(cir::isSized(ty) && "Type must implement SizedTypeInterface");
-    return cgm.getDataLayout().getTypeSizeInBits(ty);
-  }
-
   mlir::Value evaluateOrEmitBuiltinObjectSize(const clang::Expr *e,
                                               unsigned type,
                                               cir::IntType resType,
diff --git a/clang/test/CIR/CodeGen/X86/avx-builtins.c 
b/clang/test/CIR/CodeGen/X86/avx-builtins.c
index d9d1f3fc2b279..1a589b99e20f5 100644
--- a/clang/test/CIR/CodeGen/X86/avx-builtins.c
+++ b/clang/test/CIR/CodeGen/X86/avx-builtins.c
@@ -97,4 +97,4 @@ __m256 test_mm256_shuffle_ps(__m256 A, __m256 B) {
   // OGCG-LABEL: test_mm256_shuffle_ps
   // OGCG: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> 
<i32 0, i32 0, i32 8, i32 8, i32 4, i32 4, i32 12, i32 12>
   return _mm256_shuffle_ps(A, B, 0);
-}
\ No newline at end of file
+}
diff --git a/clang/test/CIR/CodeGen/X86/builtin-x86-pshufd.cpp 
b/clang/test/CIR/CodeGen/X86/builtin-x86-pshufd.cpp
new file mode 100644
index 0000000000000..29b71f7877575
--- /dev/null
+++ b/clang/test/CIR/CodeGen/X86/builtin-x86-pshufd.cpp
@@ -0,0 +1,113 @@
+// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -fclangir 
-emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s
+
+// Test that __builtin_ia32_pshufd and __builtin_ia32_vpermilp generate 
correct CIR vec.shuffle operations
+// This verifies the fix for SIMD intrinsic support that was previously NYI
+
+typedef int __v4si __attribute__((__vector_size__(16)));
+typedef float __v4sf __attribute__((__vector_size__(16)));
+typedef double __v2df __attribute__((__vector_size__(16)));
+typedef float __v8sf __attribute__((__vector_size__(32)));
+typedef double __v4df __attribute__((__vector_size__(32)));
+typedef float __v16sf __attribute__((__vector_size__(64)));
+typedef double __v8df __attribute__((__vector_size__(64)));
+
+typedef __v4si __m128i;
+typedef __v4sf __m128;
+typedef __v2df __m128d;
+typedef __v8sf __m256;
+typedef __v4df __m256d;
+typedef __v16sf __m512;
+typedef __v8df __m512d;
+
+// CHECK-LABEL: @_Z11test_pshufdv
+void test_pshufd() {
+    __m128i vec = {1, 2, 3, 4};
+    // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !s32i>) 
[#cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<0> : !s32i, #cir.int<1> : 
!s32i] : !cir.vector<4 x !s32i>
+    __m128i result = __builtin_ia32_pshufd(vec, 0x4E);
+}
+
+// CHECK-LABEL: @_Z19test_different_maskv
+void test_different_mask() {
+    __m128i vec = {10, 20, 30, 40};
+    // Test different immediate value: 0x1B = 00011011 = [3,2,1,0] reversed
+    // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !s32i>) 
[#cir.int<3> : !s32i, #cir.int<2> : !s32i, #cir.int<1> : !s32i, #cir.int<0> : 
!s32i] : !cir.vector<4 x !s32i>
+    __m128i result = __builtin_ia32_pshufd(vec, 0x1B);
+}
+
+// CHECK-LABEL: @_Z9test_casev
+void test_case() {
+    __m128i p0 = {1, 2, 3, 4};
+
+    // This reproduces the exact pattern from stb_image.h:2685 that was 
failing:
+    // _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p0, 0x4e));
+    // Which expands to: __builtin_ia32_pshufd(p0, 0x4e)
+    // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !s32i>) 
[#cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<0> : !s32i, #cir.int<1> : 
!s32i] : !cir.vector<4 x !s32i>
+    __m128i out_vec = __builtin_ia32_pshufd(p0, 0x4e);
+}
+
+// CHECK-LABEL: @_Z15test_vpermilps4v
+void test_vpermilps4() {
+    __m128 vec = {1.0f, 2.0f, 3.0f, 4.0f};
+    // vpermilps with immediate 0x4E = 01001110 = [2,3,0,1] for 4 elements
+    // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} :  !cir.vector<4 x !cir.float>) 
[#cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<0> : !s32i, #cir.int<1> : 
!s32i] : !cir.vector<4 x !cir.float>
+    __m128 result = __builtin_ia32_vpermilps(vec, 0x4E);
+}
+
+// CHECK-LABEL: @_Z15test_vpermilpd2v
+void test_vpermilpd2() {
+    __m128d vec = {1.0, 2.0};
+    // vpermilpd with immediate 0x1 = 01 = [1,0] for 2 elements
+    // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !cir.double>) 
[#cir.int<1> : !s32i, #cir.int<0> : !s32i] : !cir.vector<2 x !cir.double>
+    __m128d result = __builtin_ia32_vpermilpd(vec, 0x1);
+}
+
+// CHECK-LABEL: @_Z17test_vpermilps256v
+void test_vpermilps256() {
+    __m256 vec = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
+    // vpermilps256 with immediate 0x1B = 00011011 = [3,2,1,0] for each 
128-bit lane
+    // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !cir.float>) 
[#cir.int<3> : !s32i, #cir.int<2> : !s32i, #cir.int<1> : !s32i, #cir.int<0> : 
!s32i, #cir.int<7> : !s32i, #cir.int<6> : !s32i, #cir.int<5> : !s32i, 
#cir.int<4> : !s32i] : !cir.vector<8 x !cir.float>
+    __m256 result = __builtin_ia32_vpermilps256(vec, 0x1B);
+}
+
+// CHECK-LABEL: @_Z17test_vpermilpd256v
+void test_vpermilpd256() {
+    __m256d vec = {1.0, 2.0, 3.0, 4.0};
+    // vpermilpd256 with immediate 0x5 = 0101: per-element selector bits [1,0,1,0] give indices [1,0,3,2]
+    // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.double>) 
[#cir.int<1> : !s32i, #cir.int<0> : !s32i, #cir.int<3> : !s32i, #cir.int<2> : 
!s32i] : !cir.vector<4 x !cir.double>
+    __m256d result = __builtin_ia32_vpermilpd256(vec, 0x5);
+}
+
+// CHECK-LABEL: @_Z17test_vpermilps512v
+void test_vpermilps512() {
+    __m512 vec = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
+                  9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f};
+    // vpermilps512 with immediate 0x4E = 01001110 = [2,3,0,1] for each 
128-bit lane
+    // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x !cir.float>) 
[#cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<0> : !s32i, #cir.int<1> : 
!s32i, #cir.int<6> : !s32i, #cir.int<7> : !s32i, #cir.int<4> : !s32i, 
#cir.int<5> : !s32i, #cir.int<10> : !s32i, #cir.int<11> : !s32i, #cir.int<8> : 
!s32i, #cir.int<9> : !s32i, #cir.int<14> : !s32i, #cir.int<15> : !s32i, 
#cir.int<12> : !s32i, #cir.int<13> : !s32i] : !cir.vector<16 x !cir.float>
+    __m512 result = __builtin_ia32_vpermilps512(vec, 0x4E);
+}
+
+// CHECK-LABEL: @_Z17test_vpermilpd512v
+void test_vpermilpd512() {
+    __m512d vec = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0};
+    // vpermilpd512 with immediate 0x55 = 01010101: per-element selector bits 
[1,0,1,0,1,0,1,0] give indices [1,0,3,2,5,4,7,6]
+    // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !cir.double>) 
[#cir.int<1> : !s32i, #cir.int<0> : !s32i, #cir.int<3> : !s32i, #cir.int<2> : 
!s32i, #cir.int<5> : !s32i, #cir.int<4> : !s32i, #cir.int<7> : !s32i, 
#cir.int<6> : !s32i] : !cir.vector<8 x !cir.double>
+    __m512d result = __builtin_ia32_vpermilpd512(vec, 0x55);
+}
+
+// Test different immediate values
+// CHECK-LABEL: @_Z24test_vpermilps_differentv
+void test_vpermilps_different() {
+    __m128 vec = {10.0f, 20.0f, 30.0f, 40.0f};
+    // Test different immediate value: 0x1B = 00011011 = [3,2,1,0] reversed
+    // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.float>) 
[#cir.int<3> : !s32i, #cir.int<2> : !s32i, #cir.int<1> : !s32i, #cir.int<0> : 
!s32i] : !cir.vector<4 x !cir.float>
+    __m128 result = __builtin_ia32_vpermilps(vec, 0x1B);
+}
+
+// CHECK-LABEL: @_Z24test_vpermilpd_differentv
+void test_vpermilpd_different() {
+    __m128d vec = {100.0, 200.0};
+    // Test immediate 0x0 = 00 = [0,0] - duplicate first element
+    // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !cir.double>) 
[#cir.int<0> : !s32i, #cir.int<0> : !s32i] : !cir.vector<2 x !cir.double>
+    __m128d result = __builtin_ia32_vpermilpd(vec, 0x0);
+}
diff --git a/clang/test/CIR/CodeGen/X86/sse2-builtins.c 
b/clang/test/CIR/CodeGen/X86/sse2-builtins.c
index 31a297bd3cb52..a205600c8c1b5 100644
--- a/clang/test/CIR/CodeGen/X86/sse2-builtins.c
+++ b/clang/test/CIR/CodeGen/X86/sse2-builtins.c
@@ -146,4 +146,4 @@ __m128d test_mm_shuffle_pd(__m128d A, __m128d B) {
   // OGCG-LABEL: test_mm_shuffle_pd
   // OGCG: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> 
<i32 1, i32 2>
   return _mm_shuffle_pd(A, B, 1);
-}
\ No newline at end of file
+}

>From d013d7c0b2f0ff4d3ce2782b3a10f732c54c48f2 Mon Sep 17 00:00:00 2001
From: Thibault-Monnier <[email protected]>
Date: Thu, 27 Nov 2025 20:35:31 +0100
Subject: [PATCH 3/3] Document emitPshufWord + rename

---
 clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp | 29 ++++++++++++++--------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index e50957809e85d..5801aed5d8b11 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -68,12 +68,18 @@ static mlir::Value emitVectorFCmp(CIRGenBuilderTy &builder,
   return bitCast;
 }
 
+// Builds the VecShuffleOp for pshuflw and pshufhw x86 builtins.
 //
-static cir::VecShuffleOp emitPshufW(CIRGenFunction &cgf,
-                                    CIRGenBuilderTy &builder,
-                                    const mlir::Value vec,
-                                    const mlir::Value immediate,
-                                    const CallExpr *expr, const bool isLow) {
+// The vector is split into lanes of 8 word elements (16 bits). The lower or
+// upper half of each lane, controlled by `isLow`, is shuffled in the following
+// way: The immediate is truncated to 8 bits and split into four 2-bit fields.
+// The i-th field's value is the index, within the half lane, of the source
+// element placed at position i. The other half of the lane remains unchanged.
+static cir::VecShuffleOp emitPshufWord(CIRGenFunction &cgf,
+                                       CIRGenBuilderTy &builder,
+                                       const mlir::Value vec,
+                                       const mlir::Value immediate,
+                                       const CallExpr *expr, const bool isLow) 
{
   uint32_t imm = cgf.getZExtIntValueFromConstOp(immediate);
 
   auto vecTy = cast<cir::VectorType>(vec.getType());
@@ -99,9 +105,10 @@ static cir::VecShuffleOp emitPshufW(CIRGenFunction &cgf,
                                   ArrayRef(indices, numElts));
 }
 
+// Builds the shuffle mask for pshufd and shufpd/shufps x86 builtins.
 static llvm::SmallVector<int64_t, 16>
-computeMaskPshufDOrShufP(CIRGenFunction &cgf, const mlir::Value vec,
-                         uint32_t imm, const bool isShufP) {
+computeFullLaneShuffleMask(CIRGenFunction &cgf, const mlir::Value vec,
+                           uint32_t imm, const bool isShufP) {
   auto vecTy = cast<cir::VectorType>(vec.getType());
   unsigned numElts = vecTy.getSize();
   unsigned numLanes = cgf.cgm.getDataLayout().getTypeSizeInBits(vecTy) / 128;
@@ -584,12 +591,12 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned 
builtinID,
   case X86::BI__builtin_ia32_pshuflw:
   case X86::BI__builtin_ia32_pshuflw256:
   case X86::BI__builtin_ia32_pshuflw512: {
-    return emitPshufW(*this, builder, ops[0], ops[1], expr, true);
+    return emitPshufWord(*this, builder, ops[0], ops[1], expr, true);
   }
   case X86::BI__builtin_ia32_pshufhw:
   case X86::BI__builtin_ia32_pshufhw256:
   case X86::BI__builtin_ia32_pshufhw512: {
-    return emitPshufW(*this, builder, ops[0], ops[1], expr, false);
+    return emitPshufWord(*this, builder, ops[0], ops[1], expr, false);
   }
   case X86::BI__builtin_ia32_pshufd:
   case X86::BI__builtin_ia32_pshufd256:
@@ -602,7 +609,7 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned 
builtinID,
   case X86::BI__builtin_ia32_vpermilps512: {
     const uint32_t imm = getSExtIntValueFromConstOp(ops[1]);
     const llvm::SmallVector<int64_t, 16> mask =
-        computeMaskPshufDOrShufP(*this, ops[0], imm, false);
+        computeFullLaneShuffleMask(*this, ops[0], imm, false);
 
     return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0], mask);
   }
@@ -614,7 +621,7 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned 
builtinID,
   case X86::BI__builtin_ia32_shufps512: {
     const uint32_t imm = getZExtIntValueFromConstOp(ops[2]);
     const llvm::SmallVector<int64_t, 16> mask =
-        computeMaskPshufDOrShufP(*this, ops[0], imm, true);
+        computeFullLaneShuffleMask(*this, ops[0], imm, true);
 
     return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0], ops[1],
                                     mask);

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [CIR] Upstream vec shuffle builtins in CIR codegen (PR #169178)

Reply via email to