diff --git a/lib/CodeGen/CGBuiltin.cpp b/lib/CodeGen/CGBuiltin.cpp
index bf7d86f..2ca5bea 100644
--- a/lib/CodeGen/CGBuiltin.cpp
+++ b/lib/CodeGen/CGBuiltin.cpp
@@ -6092,10 +6092,15 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
   }
   // AVX2 broadcast
   case X86::BI__builtin_ia32_vbroadcastsi256: {
-    Value *VecTmp = CreateMemTemp(E->getArg(0)->getType());
-    Builder.CreateStore(Ops[0], VecTmp);
-    Value *F = CGM.getIntrinsic(Intrinsic::x86_avx2_vbroadcasti128);
-    return Builder.CreateCall(F, Builder.CreateBitCast(VecTmp, Int8PtrTy));
+    SmallVector<Constant *, 4> Idxs; // Shuffle mask <0, 1, 0, 1>.
+    Idxs.push_back(Builder.getInt32(0));
+    Idxs.push_back(Builder.getInt32(1));
+    Idxs.push_back(Builder.getInt32(0));
+    Idxs.push_back(Builder.getInt32(1));
+    // Broadcast via shufflevector: repeat elements 0 and 1 of Ops[0].
+    return Builder.CreateShuffleVector(Ops[0],
+                                       llvm::UndefValue::get(Ops[0]->getType()),
+                                       ConstantVector::get(Idxs));
   }
   // SSE comparison intrisics
   case X86::BI__builtin_ia32_cmpeqps:
diff --git a/test/CodeGen/avx2-builtins.c b/test/CodeGen/avx2-builtins.c
index 27ee91e..37353e4 100644
--- a/test/CodeGen/avx2-builtins.c
+++ b/test/CodeGen/avx2-builtins.c
@@ -612,7 +612,7 @@ __m256d test_mm256_broadcastsd_pd(__m128d a) {
 }
 
 __m256i test_mm256_broadcastsi128_si256(__m128i a) {
-  // CHECK: @llvm.x86.avx2.vbroadcasti128
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
   return _mm256_broadcastsi128_si256(a);
 }
 
