fghanim created this revision.
fghanim added a reviewer: jdoerfert.
Herald added subscribers: cfe-commits, guansong, yaxunl.
Herald added a project: clang.
fghanim added a parent revision: D79676: [Clang][OpenMP][OMPBuilder] Moving OMP allocation and cache creation code to OMPBuilderCBHelpers.

- Added support for CodeGen of the `private` clause
- Added support for CodeGen of the `firstprivate` clause
- Added support for CodeGen of the `copyin` clause
- Added/moved code to the OMPIRBuilder to support the above tasks (the clauses are illustrated in the snippet below)
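
To make the scope concrete, here is a minimal, hypothetical stand-alone example (not taken from the patch or its test files) showing the three clauses whose CodeGen is affected; compiling it with `-fopenmp -fopenmp-enable-irbuilder` (the flag added to the RUN lines below) would exercise the new OMPIRBuilder path.

```
// Hypothetical example; illustrates private/firstprivate/copyin only.
#include <cstdio>

int gtp = 0;                    // file-scope variable made threadprivate
#pragma omp threadprivate(gtp)  // required so it may appear in copyin()

int main() {
  int a = 1, b = 2;
  gtp = 42;                     // master thread's threadprivate value
  // 'a' gets an uninitialized private copy, 'b' a copy initialized from
  // the master's value, and each thread's 'gtp' is copied in from the
  // master's threadprivate copy on entry to the region.
#pragma omp parallel private(a) firstprivate(b) copyin(gtp)
  {
    a = b + gtp;                // operates on the per-thread copies
  }
  std::printf("%d %d %d\n", a, b, gtp); // original 'a' is unchanged
  return 0;
}
```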


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D79677

Files:
  clang/lib/CodeGen/CGStmtOpenMP.cpp
  clang/lib/CodeGen/CodeGenFunction.h
  clang/test/OpenMP/parallel_copyin_codegen.cpp
  clang/test/OpenMP/parallel_firstprivate_codegen.cpp
  clang/test/OpenMP/parallel_private_codegen.cpp

Index: clang/test/OpenMP/parallel_private_codegen.cpp
===================================================================
--- clang/test/OpenMP/parallel_private_codegen.cpp
+++ clang/test/OpenMP/parallel_private_codegen.cpp
@@ -1,8 +1,9 @@
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s -check-prefixes=ALL,CHECK
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefixes=ALL,CHECK
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-enable-irbuilder -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s -check-prefixes=ALL,IRBUILDER
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple x86_64-unknown-unknown -emit-pch -o %t %s
@@ -91,12 +92,12 @@
   }
 };
 
-// CHECK: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
+// ALL: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
 // LAMBDA: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
 // BLOCKS: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
-// CHECK: [[S_FLOAT_TY:%.+]] = type { float }
-// CHECK: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
-// CHECK: [[SST_TY:%.+]] = type { i{{[0-9]+}} }
+// ALL: [[S_FLOAT_TY:%.+]] = type { float }
+// ALL: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
+// ALL: [[SST_TY:%.+]] = type { i{{[0-9]+}} }
 template <typename T>
 T tmain() {
   S<T> test;
@@ -273,63 +274,93 @@
 #endif
 }
 
-// CHECK: define i{{[0-9]+}} @main()
-// CHECK: [[TEST:%.+]] = alloca [[S_FLOAT_TY]],
-// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
-// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 0, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*)* [[MAIN_MICROTASK:@.+]] to void
-// CHECK: = call i{{.+}} [[TMAIN_INT:@.+]]()
-// CHECK: call void [[S_FLOAT_TY_DESTR:@.+]]([[S_FLOAT_TY]]*
-// CHECK: ret
+// ALL: define i{{[0-9]+}} @main()
+// ALL: [[TEST:%.+]] = alloca [[S_FLOAT_TY]],
+// ALL: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
+// ALL: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 0, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*)* [[MAIN_MICROTASK:@.+]] to void
+// ALL: = call i{{.+}} [[TMAIN_INT:@.+]]()
+// ALL: call void [[S_FLOAT_TY_DESTR:@.+]]([[S_FLOAT_TY]]*
+// ALL: ret
 //
-// CHECK: define internal void [[MAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}})
-// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
-// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
-// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]],
-// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
-// CHECK: [[SIVAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// ALL: define internal void [[MAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}})
+// ALL: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// ALL: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// ALL: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]],
+// ALL: [[VAR_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
+// ALL: [[SIVAR_PRIV:%.+]] = alloca i{{[0-9]+}},
 // CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_REF:%.+]]
-// CHECK-NOT: [[T_VAR_PRIV]]
-// CHECK-NOT: [[VEC_PRIV]]
-// CHECK: {{.+}}:
+// IRBUILDER: %{{[0-9]+}} = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_ADDR]]
+// IRBUILDER: store i{{[0-9]+}} %{{[0-9]+}}, i{{[0-9]+}}* [[GTID_ADDR_REF:%.+]]
+// ALL-NOT: [[T_VAR_PRIV]]
+// ALL-NOT: [[VEC_PRIV]]
+// ALL: {{.+}}:
 // CHECK: [[S_ARR_PRIV_ITEM:%.+]] = phi [[S_FLOAT_TY]]*
-// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR]]([[S_FLOAT_TY]]* [[S_ARR_PRIV_ITEM]])
-// CHECK-NOT: [[T_VAR_PRIV]]
-// CHECK-NOT: [[VEC_PRIV]]
-// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
-// CHECK-DAG: call void [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
-// CHECK-DAG: call void [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]*
-// CHECK: ret void
+// IRBUILDER: {{.+}}:
+// IRBUILDER: [[S_ARR_PRIV_ITEM:%.+]] = phi [[S_FLOAT_TY]]*
+// ALL: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR]]([[S_FLOAT_TY]]* [[S_ARR_PRIV_ITEM]])
+// ALL-NOT: [[T_VAR_PRIV]]
+// ALL-NOT: [[VEC_PRIV]]
+// ALL: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
+// ALL-DAG: call void [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
+// ALL-DAG: call void [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]*
+// ALL: ret void
 
-// CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
-// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
-// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
-// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 0, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*)* [[TMAIN_MICROTASK:@.+]] to void
-// CHECK: call void [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
-// CHECK: ret
-//
-// CHECK: define {{.+}} @{{.+}}([[SS_TY]]*
-// CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* %
-// CHECK: store i8
+// ALL: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
+// ALL: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// ALL: call {{.*}} [[S_INT_TY_DEF_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
+// ALL: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 0, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*)* [[TMAIN_MICROTASK:@.+]] to void
+// ALL: call void [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
+// ALL: ret
+
+// IRBUILDER: define internal void [[TMAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}})
+// IRBUILDER: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}}, align 128
+// IRBUILDER: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}], align 128
+// IRBUILDER: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_INT_TY]]], align 128
+// IRBUILDER: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]], align 128
+// IRBUILDER: %{{[0-9]+}} = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_ADDR]]
+// IRBUILDER: store i{{[0-9]+}} %{{[0-9]+}}, i{{[0-9]+}}* [[GTID_ADDR_REF:%.+]]
+// IRBUILDER-NOT: [[T_VAR_PRIV]]
+// IRBUILDER-NOT: [[VEC_PRIV]]
+// IRBUILDER-NOT: [[SIVAR_PRIV]]
+// IRBUILDER: {{.+}}:
+// IRBUILDER: ret void
+// IRBUILDER: {{.+}}:
+// IRBUILDER: [[S_ARR_PRIV_ITEM:%.+]] = phi [[S_INT_TY]]*
+// IRBUILDER: call {{.*}} [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]* [[S_ARR_PRIV_ITEM]])
+// IRBUILDER-NOT: [[T_VAR_PRIV]]
+// IRBUILDER-NOT: [[VEC_PRIV]]
+// IRBUILDER: call {{.*}} [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]* [[VAR_PRIV]])
+// IRBUILDER-DAG: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[VAR_PRIV]])
+// IRBUILDER-DAG: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]*
+
+// ALL: define {{.+}} @{{.+}}([[SS_TY]]*
+// ALL: store i{{[0-9]+}} 0, i{{[0-9]+}}* %
+// ALL: store i8
 // CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SS_TY]]*)* [[SS_MICROTASK:@.+]] to void
-// CHECK: ret
+// IRBUILDER: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 0, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*)* [[SS_MICROTASK:@.+]] to void
+// ALL: ret
 
 // CHECK: define internal void [[SS_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}})
-// CHECK: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
-// CHECK: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
-// CHECK: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
-// CHECK: store i{{[0-9]+}}* [[A_PRIV]], i{{[0-9]+}}** [[REFA:%.+]],
-// CHECK: store i{{[0-9]+}}* [[C_PRIV]], i{{[0-9]+}}** [[REFC:%.+]],
+// IRBUILDER: define internal void [[SS_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}})
+// ALL: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
+// ALL: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
+// ALL: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
+// ALL: store i{{[0-9]+}}* [[A_PRIV]], i{{[0-9]+}}** [[REFA:%.+]],
+// ALL: store i{{[0-9]+}}* [[C_PRIV]], i{{[0-9]+}}** [[REFC:%.+]],
+// IRBUILDER: {{.+}}:
+// IRBUILDER: ret void
 // CHECK-NEXT: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFA]],
-// CHECK-NEXT: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_PRIV]],
-// CHECK-NEXT: [[INC:%.+]] = add nsw i{{[0-9]+}} [[A_VAL]], 1
-// CHECK-NEXT: store i{{[0-9]+}} [[INC]], i{{[0-9]+}}* [[A_PRIV]],
-// CHECK-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_PRIV]],
-// CHECK-NEXT: [[DEC:%.+]] = add nsw i{{[0-9]+}} [[B_VAL]], -1
-// CHECK-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_PRIV]],
-// CHECK-NEXT: [[C_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFC]],
-// CHECK-NEXT: [[C_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[C_PRIV]],
-// CHECK-NEXT: [[DIV:%.+]] = sdiv i{{[0-9]+}} [[C_VAL]], 1
-// CHECK-NEXT: store i{{[0-9]+}} [[DIV]], i{{[0-9]+}}* [[C_PRIV]],
+// IRBUILDER: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFA]],
+// ALL-NEXT: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_PRIV]],
+// ALL-NEXT: [[INC:%.+]] = add nsw i{{[0-9]+}} [[A_VAL]], 1
+// ALL-NEXT: store i{{[0-9]+}} [[INC]], i{{[0-9]+}}* [[A_PRIV]],
+// ALL-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_PRIV]],
+// ALL-NEXT: [[DEC:%.+]] = add nsw i{{[0-9]+}} [[B_VAL]], -1
+// ALL-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_PRIV]],
+// ALL-NEXT: [[C_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFC]],
+// ALL-NEXT: [[C_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[C_PRIV]],
+// ALL-NEXT: [[DIV:%.+]] = sdiv i{{[0-9]+}} [[C_VAL]], 1
+// ALL-NEXT: store i{{[0-9]+}} [[DIV]], i{{[0-9]+}}* [[C_PRIV]],
 // CHECK-NEXT: ret void
 
 // CHECK: define internal void [[TMAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}})
@@ -351,23 +382,30 @@
 // CHECK-DAG: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]*
 // CHECK: ret void
 
-// CHECK: define {{.+}} @{{.+}}([[SST_TY]]* %
-// CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* %
+// ALL: define {{.+}} @{{.+}}([[SST_TY]]* %
+// ALL: store i{{[0-9]+}} 0, i{{[0-9]+}}* %
 // CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SST_TY]]*)* [[SST_MICROTASK:@.+]] to void
-// CHECK: ret
+// IRBUILDER: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 0, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*)* [[SST_MICROTASK:@.+]] to void
+// ALL: ret
 
 // CHECK: define internal void [[SST_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SST_TY]]* %{{.+}})
+// IRBUILDER: define internal void [[SST_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}})
 // CHECK: [[GTID_ADDR_PTR:%.+]] = alloca i32*,
 // CHECK: [[GTID_ADDR:%.+]] = load i32*, i32** [[GTID_ADDR_PTR]],
 // CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_ADDR]],
-// CHECK: [[A_VOID_PTR:%.+]] = call i8* @__kmpc_alloc(i32 [[GTID]], i64 4, i8* inttoptr (i64 2 to i8*))
-// CHECK: [[A_PRIV:%.+]] = bitcast i8* [[A_VOID_PTR]] to i32*
-// CHECK: store i{{[0-9]+}}* [[A_PRIV]], i{{[0-9]+}}** [[REF:%.+]],
+// IRBUILDER: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%{{.+}}* @{{.+}})
+// ALL: [[A_VOID_PTR:%.+]] = call i8* @__kmpc_alloc(i32 [[GTID]], i64 4, i8* inttoptr (i64 2 to i8*))
+// IRBUILDER: [[GTID2:%.+]] = call i32 @__kmpc_global_thread_num(%{{.+}}* @{{.+}})
+// ALL: [[A_PRIV:%.+]] = bitcast i8* [[A_VOID_PTR]] to i32*
+// ALL: store i{{[0-9]+}}* [[A_PRIV]], i{{[0-9]+}}** [[REF:%.+]],
+// IRBUILDER: ret void
 // CHECK-NEXT: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REF]],
-// CHECK-NEXT: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_PRIV]],
-// CHECK-NEXT: [[INC:%.+]] = add nsw i{{[0-9]+}} [[A_VAL]], 1
-// CHECK-NEXT: store i{{[0-9]+}} [[INC]], i{{[0-9]+}}* [[A_PRIV]],
+// IRBUILDER: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REF]],
+// ALL-NEXT: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_PRIV]],
+// ALL-NEXT: [[INC:%.+]] = add nsw i{{[0-9]+}} [[A_VAL]], 1
+// ALL-NEXT: store i{{[0-9]+}} [[INC]], i{{[0-9]+}}* [[A_PRIV]],
 // CHECK-NEXT: call void @__kmpc_free(i32 [[GTID]], i8* [[A_VOID_PTR]], i8* inttoptr (i64 2 to i8*))
+// IRBUILDER: call void @__kmpc_free(i32 [[GTID2]], i8* [[A_VOID_PTR]], i8* inttoptr (i64 2 to i8*))
 // CHECK-NEXT: ret void
 
 #endif
Index: clang/test/OpenMP/parallel_firstprivate_codegen.cpp
===================================================================
--- clang/test/OpenMP/parallel_firstprivate_codegen.cpp
+++ clang/test/OpenMP/parallel_firstprivate_codegen.cpp
@@ -1,8 +1,11 @@
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-32
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s -check-prefixes=ALL,ALL-32,CHECK,CHECK-32
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-pc-linux-gnu -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-32
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefixes=ALL,ALL-32,CHECK,CHECK-32
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA -check-prefix=LAMBDA-32 %s
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS -check-prefix=BLOCKS-32 %s
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-enable-irbuilder -x c++ -triple i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s -check-prefixes=ALL,ALL-32,IRBUILDER,IRBUILDER-32
+// RUN: %clang_cc1 -fopenmp  -fopenmp-enable-irbuilder -x c++ -std=c++11 -triple i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp  -fopenmp-enable-irbuilder -x c++ -triple i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefixes=ALL,ALL-32,IRBUILDER,IRBUILDER-32
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-pc-linux-gnu -emit-pch -o %t %s
@@ -11,11 +14,14 @@
 // RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -fblocks -DBLOCKS -triple i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
 // SIMD-ONLY0-NOT: {{__kmpc|__tgt}}
 
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s -check-prefixes=ALL,ALL-64,CHECK,CHECK-64
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-pc-linux-gnu -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-64
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefixes=ALL,ALL-64,CHECK,CHECK-64
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple x86_64-pc-linux-gnu -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA -check-prefix=LAMBDA-64 %s
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple x86_64-pc-linux-gnu -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS -check-prefix=BLOCKS-64 %s
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-enable-irbuilder -x c++ -triple x86_64-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s -check-prefixes=ALL,ALL-64,IRBUILDER,IRBUILDER-64
+// RUN: %clang_cc1 -fopenmp -fopenmp-enable-irbuilder -x c++ -std=c++11 -triple x86_64-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-enable-irbuilder -x c++ -triple x86_64-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefixes=ALL,ALL-64,IRBUILDER,IRBUILDER-64
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple x86_64-pc-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY1 %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple x86_64-pc-linux-gnu -emit-pch -o %t %s
@@ -62,6 +68,7 @@
   int e[4];
   SS(int &d) : a(0), b(0), c(d) {
 #pragma omp parallel firstprivate(a, b, c, e)
+
 #ifdef LAMBDA
     [&]() {
       ++this->a, --b, (this)->c /= 1;
@@ -119,12 +126,12 @@
   ~S() {}
 };
 
-// CHECK: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
+// ALL: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
 // LAMBDA: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
 // BLOCKS: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
-// CHECK-DAG: [[S_FLOAT_TY:%.+]] = type { float }
-// CHECK-DAG: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
-// CHECK-DAG: [[ST_TY:%.+]] = type { i{{[0-9]+}}, i{{[0-9]+}} }
+// ALL-DAG: [[S_FLOAT_TY:%.+]] = type { float }
+// ALL-DAG: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
+// ALL-DAG: [[ST_TY:%.+]] = type { i{{[0-9]+}}, i{{[0-9]+}} }
 
 template <typename T>
 T tmain() {
@@ -342,43 +349,73 @@
 #endif
 }
 
-// CHECK: define {{.*}}i{{[0-9]+}} @main()
-// CHECK: [[TEST:%.+]] = alloca [[S_FLOAT_TY]],
-// CHECK: [[T_VAR:%.+]] = alloca i32,
-// CHECK: [[T_VARCAST:%.+]] = alloca [[iz:i64|i32]],
-// CHECK: [[SIVARCAST:%.+]] = alloca [[iz]],
-// CHECK: [[A:%.+]] = alloca i32,
-// CHECK: [[T_VARCAST1:%.+]] = alloca [[iz:i64|i32]],
-// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
-// CHECK: [[T_VARVAL:%.+]] = load i32, i32* [[T_VAR]],
-// CHECK-64: [[T_VARCONV:%.+]] = bitcast i64* [[T_VARCAST]] to i32*
-// CHECK-64: store i32 [[T_VARVAL]], i32* [[T_VARCONV]],
-// CHECK-32: store i32 [[T_VARVAL]], i32* [[T_VARCAST]],
-// CHECK: [[T_VARPVT:%.+]] = load [[iz]], [[iz]]* [[T_VARCAST]],
-// CHECK: [[SIVARVAL:%.+]] = load i32, i32* @{{.+}},
-// CHECK-64: [[SIVARCONV:%.+]] = bitcast i64* [[SIVARCAST]] to i32*
-// CHECK-64: store i32 [[SIVARVAL]], i32* [[SIVARCONV]],
-// CHECK-32: store i32 [[SIVARVAL]], i32* [[SIVARCAST]],
-// CHECK: [[SIVARPVT:%.+]] = load [[iz]], [[iz]]* [[SIVARCAST]],
+// ALL: define {{.*}}i{{[0-9]+}} @main()
+// ALL: [[TEST:%.+]] = alloca [[S_FLOAT_TY]],
+// ALL: [[T_VAR:%.+]] = alloca i32,
+// ALL: [[T_VARCAST:%.+]] = alloca [[iz:i64|i32]],
+// ALL: [[SIVARCAST:%.+]] = alloca [[iz]],
+// ALL: [[A:%.+]] = alloca i32,
+// ALL: [[T_VARCAST1:%.+]] = alloca [[iz:i64|i32]],
+// ALL: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
+// ALL: [[T_VARVAL:%.+]] = load i32, i32* [[T_VAR]],
+// ALL-64: [[T_VARCONV:%.+]] = bitcast i64* [[T_VARCAST]] to i32*
+// ALL-64: store i32 [[T_VARVAL]], i32* [[T_VARCONV]],
+// ALL-32: store i32 [[T_VARVAL]], i32* [[T_VARCAST]],
+// ALL: [[T_VARPVT:%.+]] = load [[iz]], [[iz]]* [[T_VARCAST]],
+// ALL: [[SIVARVAL:%.+]] = load i32, i32* @{{.+}},
+// ALL-64: [[SIVARCONV:%.+]] = bitcast i64* [[SIVARCAST]] to i32*
+// ALL-64: store i32 [[SIVARVAL]], i32* [[SIVARCONV]],
+// ALL-32: store i32 [[SIVARVAL]], i32* [[SIVARCAST]],
+// ALL: [[SIVARPVT:%.+]] = load [[iz]], [[iz]]* [[SIVARCAST]],
 // CHECK: call {{.*}}void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 5, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [2 x i32]*, [[iz]], [2 x [[S_FLOAT_TY]]]*, [[S_FLOAT_TY]]*, i{{[0-9]+}})* [[MAIN_MICROTASK:@.+]] to void {{.*}}[[iz]] [[T_VARPVT]],{{.*}}[[iz]] [[SIVARPVT]]
-// CHECK: [[T_VARVAL:%.+]] = load i32, i32* [[T_VAR]],
-// CHECK-64: [[T_VARCONV:%.+]] = bitcast i64* [[T_VARCAST1]] to i32*
-// CHECK-64: store i32 [[T_VARVAL]], i32* [[T_VARCONV]],
-// CHECK-32: store i32 [[T_VARVAL]], i32* [[T_VARCAST1]],
-// CHECK: [[T_VARPVT:%.+]] = load [[iz]], [[iz]]* [[T_VARCAST1]],
-// CHECK: call {{.*}}void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[iz]])* [[MAIN_MICROTASK1:@.+]] to void {{.*}}[[iz]] [[T_VARPVT]])
-// CHECK: = call {{.*}}i{{.+}} [[TMAIN_INT:@.+]]()
-// CHECK: call {{.*}} [[S_FLOAT_TY_DESTR:@.+]]([[S_FLOAT_TY]]*
-// CHECK: ret
+// IRBUILDER: call {{.*}}void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 5, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[iz]], [2 x i32]*, [2 x [[S_FLOAT_TY]]]*, [[S_FLOAT_TY]]*, i{{[0-9]+}})* [[MAIN_MICROTASK:@.+]] to void {{.*}}{{.*}}[[iz]] [[T_VARPVT]],{{.*}}[[iz]] [[SIVARPVT]]
+// ALL: [[T_VARVAL:%.+]] = load i32, i32* [[T_VAR]],
+// ALL-64: [[T_VARCONV:%.+]] = bitcast i64* [[T_VARCAST1]] to i32*
+// ALL-64: store i32 [[T_VARVAL]], i32* [[T_VARCONV]],
+// ALL-32: store i32 [[T_VARVAL]], i32* [[T_VARCAST1]],
+// ALL: [[T_VARPVT:%.+]] = load [[iz]], [[iz]]* [[T_VARCAST1]],
+// ALL: call {{.*}}void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[iz]])* [[MAIN_MICROTASK1:@.+]] to void {{.*}}[[iz]] [[T_VARPVT]])
+// ALL: = call {{.*}}i{{.+}} [[TMAIN_INT:@.+]]()
+// ALL: call {{.*}} [[S_FLOAT_TY_DESTR:@.+]]([[S_FLOAT_TY]]*
+// ALL: ret
+//
+// IRBUILDER:    define internal void [[MAIN_MICROTASK1]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[iz]] [[T_VAR:%.+]])
+// IRBUILDER: 	 [[GTID_LOCAL:%.+]] = alloca i32
+// IRBUILDER: 	 [[GTID_LD:%.+]] = load i32, i32* [[GTID_ADDR]]
+// IRBUILDER: 	 store i32 [[GTID_LD]], i32* [[GTID_LOCAL]]
+// IRBUILDER: 	 [[GTID:%.+]] = load i32, i32* [[GTID_LOCAL]]
+// IRBUILDER:    store [[iz]] [[T_VAR]], [[iz]]* [[T_VAR_ADDR:%.+]],
+// IRBUILDER-64: [[BC:%.+]] = bitcast [[iz]]* [[T_VAR_ADDR]] to i32*
+// IRBUILDER:		 [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%{{.+}}* @{{.+}})
+// IRBUILDER:    [[T_VAR_VOID_PTR:%.+]] = call i8* @__kmpc_alloc(i32 [[GTID]], [[iz]] 4, i8* inttoptr ([[iz]] 1 to i8*))
+// IRBUILDER:		 [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%{{.+}}* @{{.+}})
+// IRBUILDER:    [[T_VAR_PRIV:%.+]] = bitcast i8* [[T_VAR_VOID_PTR]] to i32*
+// IRBUILDER-32: [[T_VAR_VAL:%.+]] = load i32, i32* [[T_VAR_ADDR]],
+// IRBUILDER-64: [[T_VAR_VAL:%.+]] = load i32, i32* [[BC]],
+// IRBUILDER:    store i32 [[T_VAR_VAL]], i32* [[T_VAR_PRIV]],
+// IRBUILDER:    ret void
+// IRBUILDER:    store i32 0, i32* [[T_VAR_PRIV]],
+// IRBUILDER:    call void @__kmpc_free(i32 [[GTID]], i8* [[T_VAR_VOID_PTR]], i8* inttoptr ([[iz]] 1 to i8*))
+
 //
 // CHECK: define internal {{.*}}void [[MAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [2 x i32]* dereferenceable(8) %{{.+}}, [[iz]] {{.*}}%{{.+}}, [2 x [[S_FLOAT_TY]]]* dereferenceable(8) %{{.+}}, [[S_FLOAT_TY]]* dereferenceable(4) %{{.+}}, [[iz]] {{.*}}[[SIVAR:%.+]])
-// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// IRBUILDER: define internal {{.*}}void [[MAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[iz]] {{.*}}%{{.+}}, [2 x i32]* [[VEC_REF:%.+]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_REF:%.+]], [[S_FLOAT_TY]]* %{{.+}}, [[iz]] {{.*}}[[SIVAR:%.+]])
+// IRBUILDER: 	 [[GTID_LOCAL:%.+]] = alloca i32
+// ALL: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
 // CHECK: [[SIVAR7_PRIV:%.+]] = alloca i{{[0-9]+}},
-// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
-// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]],
-// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
-// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+// ALL: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// ALL: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]],
+// IRBUILDER: [[VAR_ADDR:%.+]] = alloca [[S_FLOAT_TY]]*,
+// ALL: [[VAR_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
+// IRBUILDER: [[SIVAR7_PRIV:%.+]] = alloca i{{[0-9]+}},
+// IRBUILDER: [[GTID_LD:%.+]] = load i32, i32* [[GTID_ADDR]]
+// IRBUILDER: store i32 [[GTID_LD]], i32* [[GTID_LOCAL]]
+// IRBUILDER: [[GTID:%.+]] = load i32, i32* [[GTID_LOCAL]]
+// IRBUILDER-64: [[T_VAR_CONV:%.+]] = bitcast i64* [[T_VAR_PRIV]] to i32*
+// IRBUILDER: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
+// IRBUILDER: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_REF]] to i8*
 
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
 // CHECK: [[VEC_REF:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** %
 // CHECK-NOT: load i{{[0-9]+}}*, i{{[0-9]+}}** %
 // CHECK-64: [[T_VAR_CONV:%.+]] = bitcast i64* [[T_VAR_PRIV]] to i32*
@@ -386,31 +423,35 @@
 // CHECK: [[VAR_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** %
 // CHECK-NOT: load i{{[0-9]+}}*, i{{[0-9]+}}** %
 // CHECK-64: [[SIVAR7_CONV:%.+]] = bitcast i64* [[SIVAR7_PRIV]] to i32*
+
 // CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
 // CHECK: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_REF]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST]], i8* align {{[0-9]+}} [[VEC_SRC]],
-// CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
-// CHECK: [[S_ARR_BEGIN:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_REF]] to [[S_FLOAT_TY]]*
-// CHECK: [[S_ARR_PRIV_END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_PRIV_BEGIN]], i{{[0-9]+}} 2
-// CHECK: [[IS_EMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_PRIV_BEGIN]], [[S_ARR_PRIV_END]]
-// CHECK: br i1 [[IS_EMPTY]], label %[[S_ARR_BODY_DONE:.+]], label %[[S_ARR_BODY:.+]]
-// CHECK: [[S_ARR_BODY]]
-// CHECK: call {{.*}} [[ST_TY_DEFAULT_CONSTR:@.+]]([[ST_TY]]* [[ST_TY_TEMP:%.+]])
-// CHECK: call {{.*}} [[S_FLOAT_TY_COPY_CONSTR:@.+]]([[S_FLOAT_TY]]* {{.+}}, [[S_FLOAT_TY]]* {{.+}}, [[ST_TY]]* [[ST_TY_TEMP]])
-// CHECK: call {{.*}} [[ST_TY_DESTR:@.+]]([[ST_TY]]* [[ST_TY_TEMP]])
-// CHECK: br i1 {{.+}}, label %{{.+}}, label %[[S_ARR_BODY]]
-// CHECK: call {{.*}} [[ST_TY_DEFAULT_CONSTR]]([[ST_TY]]* [[ST_TY_TEMP:%.+]])
-// CHECK: call {{.*}} [[S_FLOAT_TY_COPY_CONSTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]], [[S_FLOAT_TY]]* {{.*}} [[VAR_REF]], [[ST_TY]]* [[ST_TY_TEMP]])
+// ALL: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST]], i8* align {{[0-9]+}} [[VEC_SRC]],
+// ALL: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// ALL: [[S_ARR_BEGIN:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_REF]] to [[S_FLOAT_TY]]*
+// ALL: [[S_ARR_PRIV_END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_PRIV_BEGIN]], i{{[0-9]+}} 2
+// ALL: [[IS_EMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_PRIV_BEGIN]], [[S_ARR_PRIV_END]]
+// ALL: br i1 [[IS_EMPTY]], label %[[S_ARR_BODY_DONE:.+]], label %[[S_ARR_BODY:.+]]
+// IRBUILDER: ret void
+// ALL: [[S_ARR_BODY]]
+// ALL: call {{.*}} [[ST_TY_DEFAULT_CONSTR:@.+]]([[ST_TY]]* [[ST_TY_TEMP:%.+]])
+// ALL: call {{.*}} [[S_FLOAT_TY_COPY_CONSTR:@.+]]([[S_FLOAT_TY]]* {{.+}}, [[S_FLOAT_TY]]* {{.+}}, [[ST_TY]]* [[ST_TY_TEMP]])
+// ALL: call {{.*}} [[ST_TY_DESTR:@.+]]([[ST_TY]]* [[ST_TY_TEMP]])
+// ALL: br i1 {{.+}}, label %{{.+}}, label %[[S_ARR_BODY]]
+// IRBUILDER: [[VAR_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** %
+// ALL: call {{.*}} [[ST_TY_DEFAULT_CONSTR]]([[ST_TY]]* [[ST_TY_TEMP:%.+]])
+// ALL: call {{.*}} [[S_FLOAT_TY_COPY_CONSTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]], [[S_FLOAT_TY]]* {{.*}} [[VAR_REF]], [[ST_TY]]* [[ST_TY_TEMP]])
+// IRBUILDER-64: [[SIVAR7_CONV:%.+]] = bitcast i64* [[SIVAR7_PRIV]] to i32*
 // CHECK: call {{.*}} [[ST_TY_DESTR]]([[ST_TY]]* [[ST_TY_TEMP]])
 
-// CHECK-64: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[SIVAR7_CONV]],
-// CHECK-32: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[SIVAR7_PRIV]],
+// ALL-64: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[SIVAR7_CONV]],
+// ALL-32: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[SIVAR7_PRIV]],
+// IRBUILDER: call {{.*}} [[ST_TY_DESTR]]([[ST_TY]]* [[ST_TY_TEMP]])
 
 // CHECK-DAG: call {{.*}} [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
-// CHECK-DAG: call {{.*}} [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]*
+// ALL-DAG: call {{.*}} [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]*
 // CHECK: ret void
 
-
 // CHECK:    define internal void [[MAIN_MICROTASK1]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[iz]] [[T_VAR:%.+]])
 // CHECK:    [[GTID_ADDR:%.+]] = alloca i32*,
 // CHECK:    store [[iz]] [[T_VAR]], [[iz]]* [[T_VAR_ADDR:%.+]],
@@ -426,13 +467,13 @@
 // CHECK:    call void @__kmpc_free(i32 [[GTID]], i8* [[T_VAR_VOID_PTR]], i8* inttoptr ([[iz]] 1 to i8*))
 // CHECK:    ret void
 
-
-// CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
-// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
-// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
+// ALL: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
+// ALL: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// ALL: call {{.*}} [[S_INT_TY_DEF_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
 // CHECK: call {{.*}}void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 4, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [2 x i32]*, i32*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]*)* [[TMAIN_MICROTASK:@.+]] to void
-// CHECK: call {{.*}} [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
-// CHECK: ret
+// IRBUILDER: call {{.*}}void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 4, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, i32*, [2 x i32]*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]*)* [[TMAIN_MICROTASK:@.+]] to void
+// ALL: call {{.*}} [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
+// ALL: ret i{{[0-9]+}}
 //
 // CHECK: define {{.+}} @{{.+}}([[SS_TY]]*
 // CHECK: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
@@ -446,14 +487,70 @@
 // CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 5, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SS_TY]]*, [[iz]], [[iz]], [[iz]], [4 x i32]*)* [[SS_MICROTASK:@.+]] to void
 // CHECK: ret
 
+// IRBUILDER: define internal {{.*}}void [[TMAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, i32* %{{.+}}, [2 x i32]* %{{.+}}, [2 x [[S_INT_TY]]]* %{{.+}}, [[S_INT_TY]]* %{{.+}})
+// IRBUILDER: [[GTID_LOCAL:%.+]] = alloca i32
+// IRBUILDER: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}}*, align 128
+// IRBUILDER: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}], align 128
+// IRBUILDER: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_INT_TY]]], align 128
+// IRBUILDER: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]], align 128
+// IRBUILDER: [[GTID_LD:%.+]] = load i32, i32* [[GTID_ADDR]]
+// IRBUILDER: store i32 [[GTID_LD]], i32* [[GTID_LOCAL]]
+// IRBUILDER: [[GTID:%.+]] = load i32, i32* [[GTID_LOCAL]]
+// IRBUILDER: [[T_VAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** %
+// IRBUILDER: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
+// IRBUILDER: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_REF]] to i8*
+// IRBUILDER: call void @llvm.memcpy.{{.+}}(i8* align 128 [[VEC_DEST]], i8* align 128 [[VEC_SRC]], i{{[0-9]+}} {{[0-9]+}}, i1
+// IRBUILDER: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// IRBUILDER: [[S_ARR_BEGIN:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_REF]] to [[S_INT_TY]]*
+// IRBUILDER: [[S_ARR_PRIV_END:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_PRIV_BEGIN]], i{{[0-9]+}} 2
+// IRBUILDER: [[IS_EMPTY:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_PRIV_BEGIN]], [[S_ARR_PRIV_END]]
+// IRBUILDER: br i1 [[IS_EMPTY]], label %[[S_ARR_BODY_DONE:.+]], label %[[S_ARR_BODY:.+]]
+
+// IRBUILDER: ret void
+
+// IRBUILDER: [[S_ARR_BODY]]
+// IRBUILDER: call {{.*}} [[ST_TY_DEFAULT_CONSTR]]([[ST_TY]]* [[ST_TY_TEMP:%.+]])
+// IRBUILDER: call {{.*}} [[S_INT_TY_COPY_CONSTR:@.+]]([[S_INT_TY]]* {{.+}}, [[S_INT_TY]]* {{.+}}, [[ST_TY]]* [[ST_TY_TEMP]])
+// IRBUILDER: call {{.*}} [[ST_TY_DESTR]]([[ST_TY]]* [[ST_TY_TEMP]])
+// IRBUILDER: br i1 {{.+}}, label %{{.+}}, label %[[S_ARR_BODY]]
+// IRBUILDER: [[VAR_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** %
+// IRBUILDER: call {{.*}} [[ST_TY_DEFAULT_CONSTR]]([[ST_TY]]* [[ST_TY_TEMP:%.+]])
+// IRBUILDER: call {{.*}} [[S_INT_TY_COPY_CONSTR]]([[S_INT_TY]]* [[VAR_PRIV]], [[S_INT_TY]]* {{.*}} [[VAR_REF]], [[ST_TY]]* [[ST_TY_TEMP]])
+// IRBUILDER: [[T_VAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_REF]],
+// IRBUILDER: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}*
+// IRBUILDER: call {{.*}} [[ST_TY_DESTR]]([[ST_TY]]* [[ST_TY_TEMP]])
+// IRBUILDER-NOT: call {{.*}}void @__kmpc_barrier(
+
+// IRBUILDER-DAG: call {{.*}} [[S_INT_TY_DESTR]]([[S_INT_TY]]*
+
+// IRBUILDER: define {{.+}} @{{.+}}([[SS_TY]]*
+// IRBUILDER: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+// IRBUILDER: store i{{[0-9]+}} 0, i{{[0-9]+}}* %
+// IRBUILDER: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+// IRBUILDER: store i8
+// IRBUILDER: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+// IRBUILDER: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+// IRBUILDER: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+// IRBUILDER: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+// IRBUILDER: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 6, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, i32**, [[iz]], [[iz]], i32**, [[iz]], [4 x i32]**)* [[SS_MICROTASK:@.+]] to void
+// IRBUILDER: ret
+
 // CHECK: define internal void [[SS_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}}, [[iz]] {{.+}}, [[iz]] {{.+}}, [[iz]] {{.+}}, [4 x i{{[0-9]+}}]* {{.+}})
-// CHECK: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
-// CHECK: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
-// CHECK: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
-// CHECK: [[E_PRIV:%.+]] = alloca [4 x i{{[0-9]+}}],
-// CHECK: store i{{[0-9]+}} {{.+}}, i{{[0-9]+}}* [[A_PRIV]]
-// CHECK: store i{{[0-9]+}} {{.+}}, i{{[0-9]+}}* [[B_PRIV]]
-// CHECK: store i{{[0-9]+}} {{.+}}, i{{[0-9]+}}* [[C_PRIV]]
+// IRBUILDER: define internal void [[SS_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, i32** %{{.+}}, [[iz]] {{.+}}, [[iz]] {{.+}}, i32** %{{.+}}, [[iz]] {{.+}}, [4 x i{{[0-9]+}}]** {{.+}})
+// ALL: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
+// ALL: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
+// ALL: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
+// ALL: [[E_PRIV:%.+]] = alloca [4 x i{{[0-9]+}}],
+// ALL: store i{{[0-9]+}} {{.+}}, i{{[0-9]+}}* [[A_PRIV]]
+// IRBUILDER-64: [[A_CONV:%.+]] = bitcast i64* [[A_PRIV:%.+]] to i32*
+// IRBUILDER-64: store i32* [[A_CONV]], i32** [[REFA:%.+]],
+// IRBUILDER-32: store i32* [[A_PRIV]], i32** [[REFA:%.+]],
+// ALL: store i{{[0-9]+}} {{.+}}, i{{[0-9]+}}* [[B_PRIV]]
+// IRBUILDER-64: [[B_CONV:%.+]] = bitcast i64* [[B_PRIV:%.+]] to i32*
+// ALL: store i{{[0-9]+}} {{.+}}, i{{[0-9]+}}* [[C_PRIV]]
+// IRBUILDER-64: [[C_CONV:%.+]] = bitcast i64* [[C_PRIV:%.+]] to i32*
+// IRBUILDER-64: store i32* [[C_CONV]], i32** [[REFC:%.+]],
+// IRBUILDER-32: store i32* [[C_PRIV]], i32** [[REFC:%.+]],
 // CHECK-64: [[A_CONV:%.+]] = bitcast i64* [[A_PRIV:%.+]] to i32*
 // CHECK-64: [[B_CONV:%.+]] = bitcast i64* [[B_PRIV:%.+]] to i32*
 // CHECK-64: [[C_CONV:%.+]] = bitcast i64* [[C_PRIV:%.+]] to i32*
@@ -461,26 +558,28 @@
 // CHECK-32: store i32* [[A_PRIV]], i32** [[REFA:%.+]],
 // CHECK-64: store i32* [[C_CONV]], i32** [[REFC:%.+]],
 // CHECK-32: store i32* [[C_PRIV]], i32** [[REFC:%.+]],
-// CHECK: bitcast [4 x i{{[0-9]+}}]* [[E_PRIV]] to i8*
-// CHECK: bitcast [4 x i{{[0-9]+}}]* %{{.+}} to i8*
-// CHECK: call void @llvm.memcpy
-// CHECK: store [4 x i{{[0-9]+}}]* [[E_PRIV]], [4 x i{{[0-9]+}}]** [[REFE:%.+]],
+// ALL: bitcast [4 x i{{[0-9]+}}]* [[E_PRIV]] to i8*
+// ALL: bitcast [4 x i{{[0-9]+}}]* %{{.+}} to i8*
+// ALL: call void @llvm.memcpy
+// ALL: store [4 x i{{[0-9]+}}]* [[E_PRIV]], [4 x i{{[0-9]+}}]** [[REFE:%.+]],
+// IRBUILDER: ret void
+// IRBUILDER: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFA]],
 // CHECK-NEXT: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFA]],
-// CHECK-NEXT: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_PRIV]],
-// CHECK-NEXT: [[INC:%.+]] = add nsw i{{[0-9]+}} [[A_VAL]], 1
-// CHECK-NEXT: store i{{[0-9]+}} [[INC]], i{{[0-9]+}}* [[A_PRIV]],
-// CHECK-64-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_CONV]],
-// CHECK-32-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_PRIV]],
-// CHECK-NEXT: [[DEC:%.+]] = add nsw i{{[0-9]+}} [[B_VAL]], -1
-// CHECK-64-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_CONV]],
-// CHECK-32-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_PRIV]],
-// CHECK-NEXT: [[C_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFC]],
-// CHECK-NEXT: [[C_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[C_PRIV]],
-// CHECK-NEXT: [[DIV:%.+]] = sdiv i{{[0-9]+}} [[C_VAL]], 1
-// CHECK-NEXT: store i{{[0-9]+}} [[DIV]], i{{[0-9]+}}* [[C_PRIV]],
-// CHECK-NEXT: [[E_PRIV:%.+]] = load [4 x i{{[0-9]+}}]*, [4 x i{{[0-9]+}}]** [[REFE]],
-// CHECK-NEXT: [[E_PRIV_2:%.+]] = getelementptr inbounds [4 x i{{[0-9]+}}], [4 x i{{[0-9]+}}]* [[E_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
-// CHECK-NEXT: store i32 1111, i32* [[E_PRIV_2]],
+// ALL-NEXT: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_PRIV]],
+// ALL-NEXT: [[INC:%.+]] = add nsw i{{[0-9]+}} [[A_VAL]], 1
+// ALL-NEXT: store i{{[0-9]+}} [[INC]], i{{[0-9]+}}* [[A_PRIV]],
+// ALL-64-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_CONV]],
+// ALL-32-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_PRIV]],
+// ALL-NEXT: [[DEC:%.+]] = add nsw i{{[0-9]+}} [[B_VAL]], -1
+// ALL-64-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_CONV]],
+// ALL-32-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_PRIV]],
+// ALL-NEXT: [[C_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFC]],
+// ALL-NEXT: [[C_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[C_PRIV]],
+// ALL-NEXT: [[DIV:%.+]] = sdiv i{{[0-9]+}} [[C_VAL]], 1
+// ALL-NEXT: store i{{[0-9]+}} [[DIV]], i{{[0-9]+}}* [[C_PRIV]],
+// ALL-NEXT: [[E_PRIV:%.+]] = load [4 x i{{[0-9]+}}]*, [4 x i{{[0-9]+}}]** [[REFE]],
+// ALL-NEXT: [[E_PRIV_2:%.+]] = getelementptr inbounds [4 x i{{[0-9]+}}], [4 x i{{[0-9]+}}]* [[E_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// ALL-NEXT: store i32 1111, i32* [[E_PRIV_2]],
 // CHECK-NEXT: ret void
 
 // CHECK: define internal {{.*}}void [[TMAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [2 x i32]* dereferenceable(8) %{{.+}}, i32* dereferenceable(4) %{{.+}}, [2 x [[S_INT_TY]]]* dereferenceable(8) %{{.+}}, [[S_INT_TY]]* dereferenceable(4) %{{.+}})
Index: clang/test/OpenMP/parallel_copyin_codegen.cpp
===================================================================
--- clang/test/OpenMP/parallel_copyin_codegen.cpp
+++ clang/test/OpenMP/parallel_copyin_codegen.cpp
@@ -1,9 +1,12 @@
-// RUN: %clang_cc1 -verify -fopenmp -fnoopenmp-use-tls -x c++ -triple x86_64-linux -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -fnoopenmp-use-tls -x c++ -triple x86_64-linux -emit-llvm %s -o - | FileCheck %s -check-prefixes=ALL,CHECK
 // RUN: %clang_cc1 -fopenmp -fnoopenmp-use-tls -x c++ -std=c++11 -triple x86_64-linux -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -fnoopenmp-use-tls -x c++ -triple x86_64-linux -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -fnoopenmp-use-tls -x c++ -triple x86_64-linux -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefixes=ALL,CHECK
 // RUN: %clang_cc1 -verify -fopenmp -fnoopenmp-use-tls -x c++ -std=c++11 -DLAMBDA -triple x86_64-linux -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s
 // RUN: %clang_cc1 -verify -fopenmp -fnoopenmp-use-tls -x c++ -fblocks -DBLOCKS -triple x86_64-linux -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s
 // RUN: %clang_cc1 -verify -fopenmp -fnoopenmp-use-tls -x c++ -std=c++11 -DARRAY -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=ARRAY %s
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-enable-irbuilder -fnoopenmp-use-tls -x c++ -triple x86_64-linux -emit-llvm %s -o - | FileCheck %s -check-prefixes=ALL,IRBUILDER
+// RUN: %clang_cc1 -fopenmp -fopenmp-enable-irbuilder -fnoopenmp-use-tls -x c++ -std=c++11 -triple x86_64-linux -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-enable-irbuilder -fnoopenmp-use-tls -x c++ -triple x86_64-linux -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefixes=ALL,IRBUILDER
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -fnoopenmp-use-tls -x c++ -triple x86_64-linux -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
 // RUN: %clang_cc1 -fopenmp-simd -fnoopenmp-use-tls -x c++ -std=c++11 -triple x86_64-linux -emit-pch -o %t %s
@@ -46,21 +49,22 @@
   ~S() {}
 };
 
-// CHECK-DAG: [[S_FLOAT_TY:%.+]] = type { float }
-// CHECK-DAG: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
+// ALL-DAG: [[S_FLOAT_TY:%.+]] = type { float }
+// ALL-DAG: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
 // CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr global %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
+// IRBUILDER-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr global %{{.+}} { i32 0, i32 34, i32 0, i32 0, i8*
 // TLS-CHECK-DAG: [[S_FLOAT_TY:%.+]] = type { float }
 // TLS-CHECK-DAG: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
 // TLS-CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr global %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
 
-// CHECK-DAG: [[T_VAR:@.+]] = internal global i{{[0-9]+}} 1122,
-// CHECK-DAG: [[VEC:@.+]] = internal global [2 x i{{[0-9]+}}] [i{{[0-9]+}} 1, i{{[0-9]+}} 2],
-// CHECK-DAG: [[S_ARR:@.+]] = internal global [2 x [[S_FLOAT_TY]]] zeroinitializer,
-// CHECK-DAG: [[VAR:@.+]] = internal global [[S_FLOAT_TY]] zeroinitializer,
-// CHECK-DAG: [[TMAIN_T_VAR:@.+]] = linkonce_odr {{(dso_local )?}}global i{{[0-9]+}} 333,
-// CHECK-DAG: [[TMAIN_VEC:@.+]] = linkonce_odr {{(dso_local )?}}global [2 x i{{[0-9]+}}] [i{{[0-9]+}} 3, i{{[0-9]+}} 3],
-// CHECK-DAG: [[TMAIN_S_ARR:@.+]] = linkonce_odr {{(dso_local )?}}global [2 x [[S_INT_TY]]] zeroinitializer,
-// CHECK-DAG: [[TMAIN_VAR:@.+]] = linkonce_odr {{(dso_local )?}}global [[S_INT_TY]] zeroinitializer,
+// ALL-DAG: [[T_VAR:@.+]] = internal global i{{[0-9]+}} 1122,
+// ALL-DAG: [[VEC:@.+]] = internal global [2 x i{{[0-9]+}}] [i{{[0-9]+}} 1, i{{[0-9]+}} 2],
+// ALL-DAG: [[S_ARR:@.+]] = internal global [2 x [[S_FLOAT_TY]]] zeroinitializer,
+// ALL-DAG: [[VAR:@.+]] = internal global [[S_FLOAT_TY]] zeroinitializer,
+// ALL-DAG: [[TMAIN_T_VAR:@.+]] = linkonce_odr {{(dso_local )?}}global i{{[0-9]+}} 333,
+// ALL-DAG: [[TMAIN_VEC:@.+]] = linkonce_odr {{(dso_local )?}}global [2 x i{{[0-9]+}}] [i{{[0-9]+}} 3, i{{[0-9]+}} 3],
+// ALL-DAG: [[TMAIN_S_ARR:@.+]] = linkonce_odr {{(dso_local )?}}global [2 x [[S_INT_TY]]] zeroinitializer,
+// ALL-DAG: [[TMAIN_VAR:@.+]] = linkonce_odr {{(dso_local )?}}global [[S_INT_TY]] zeroinitializer,
 // TLS-CHECK-DAG: [[T_VAR:@.+]] = internal thread_local global i{{[0-9]+}} 1122,
 // TLS-CHECK-DAG: [[VEC:@.+]] = internal thread_local global [2 x i{{[0-9]+}}] [i{{[0-9]+}} 1, i{{[0-9]+}} 2],
 // TLS-CHECK-DAG: [[S_ARR:@.+]] = internal thread_local global [2 x [[S_FLOAT_TY]]] zeroinitializer,
@@ -230,14 +234,14 @@
 #endif
 }
 
-// CHECK-LABEL: @main
-// CHECK: [[TEST:%.+]] = alloca [[S_FLOAT_TY]],
-// CHECK: call {{.*}} [[S_FLOAT_TY_COPY_ASSIGN:@.+]]([[S_FLOAT_TY]]* [[TEST]], [[S_FLOAT_TY]]*
-// CHECK: call {{.*}}void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 0, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*)* [[MAIN_MICROTASK:@.+]] to void (i32*, i32*, ...)*))
-// CHECK: call {{.*}}void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 0, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*)* [[MAIN_MICROTASK1:@.+]] to void (i32*, i32*, ...)*))
-// CHECK: = call {{.*}}i{{.+}} [[TMAIN_INT:@.+]]()
-// CHECK: call {{.*}} [[S_FLOAT_TY_DESTR:@.+]]([[S_FLOAT_TY]]*
-// CHECK: ret
+// ALL-LABEL: @main
+// ALL: [[TEST:%.+]] = alloca [[S_FLOAT_TY]],
+// ALL: call {{.*}} [[S_FLOAT_TY_COPY_ASSIGN:@.+]]([[S_FLOAT_TY]]* [[TEST]], [[S_FLOAT_TY]]*
+// ALL: call {{.*}}void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 0, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*)* [[MAIN_MICROTASK:@.+]] to void (i32*, i32*, ...)*))
+// ALL: call {{.*}}void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 0, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*)* [[MAIN_MICROTASK1:@.+]] to void (i32*, i32*, ...)*))
+// ALL: = call {{.*}}i{{.+}} [[TMAIN_INT:@.+]]()
+// ALL: call {{.*}} [[S_FLOAT_TY_DESTR:@.+]]([[S_FLOAT_TY]]*
+// ALL: ret
 
 // TLS-CHECK-LABEL: @main
 // TLS-CHECK: [[TEST:%.+]] = alloca [[S_FLOAT_TY]],
@@ -248,22 +252,56 @@
 // TLS-CHECK: call {{.*}} [[S_FLOAT_TY_DESTR:@.+]]([[S_FLOAT_TY]]*
 // TLS-CHECK: ret
 
-// CHECK: define internal {{.*}}void [[MAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}})
+// IRBUILDER: define internal {{.*}}void [[MAIN_MICROTASK1]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}})
+// IRBUILDER: [[GTID_LOCAL:%.+]] = alloca i32
+// IRBUILDER: [[GTID_LD:%.+]] = load i32, i32* [[GTID_ADDR]]
+// IRBUILDER: store i32 [[GTID_LD]], i32* [[GTID_LOCAL]]
+// IRBUILDER: [[GTID:%.+]] = load i32, i32* [[GTID_LOCAL]]
+
+// threadprivate_t_var = t_var;
+// IRBUILDER: call {{.*}}i8* @__kmpc_threadprivate_cached({{.+}} [[T_VAR]]
+// IRBUILDER: ptrtoint i{{[0-9]+}}* %{{.+}} to i{{[0-9]+}}
+// IRBUILDER: icmp ne i{{[0-9]+}} ptrtoint (i{{[0-9]+}}* [[T_VAR]] to i{{[0-9]+}}), %{{.+}}
+// IRBUILDER: br i1 %{{.+}}, label %[[NOT_MASTER:.+]], label %[[DONE:.+]]
+
+// IRBUILDER: ret void
+
+// IRBUILDER: [[DONE]]
+// IRBUILDER: [[GTID_CALL:%.+]] = call i32 @__kmpc_global_thread_num(%{{.+}}* @{{.+}})
+// IRBUILDER: call {{.*}}void @__kmpc_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i32 [[GTID_CALL]])
+// IRBUILDER: add nsw i32 %{{.+}}, 1
+
+// IRBUILDER: [[NOT_MASTER]]
+// IRBUILDER: load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR]],
+// IRBUILDER: store i{{[0-9]+}} %{{.+}}, i{{[0-9]+}}* %{{.+}},
+
+// ALL: define internal {{.*}}void [[MAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}})
 // CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
 // CHECK: [[GTID_ADDR:%.+]] = load i32*, i32** [[GTID_ADDR_ADDR]],
 // CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_ADDR]],
 
+// IRBUILDER: [[GTID_LOCAL:%.+]] = alloca i32
+// IRBUILDER: [[GTID_LD:%.+]] = load i32, i32* [[GTID_ADDR]]
+// IRBUILDER: store i32 [[GTID_LD]], i32* [[GTID_LOCAL]]
+// IRBUILDER: [[GTID:%.+]] = load i32, i32* [[GTID_LOCAL]]
+
 // TLS-CHECK: define internal {{.*}}void [[MAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}},
 // TLS-CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
 
 // threadprivate_t_var = t_var;
-// CHECK: call {{.*}}i8* @__kmpc_threadprivate_cached({{.+}} [[T_VAR]]
-// CHECK: ptrtoint i{{[0-9]+}}* %{{.+}} to i{{[0-9]+}}
-// CHECK: icmp ne i{{[0-9]+}} ptrtoint (i{{[0-9]+}}* [[T_VAR]] to i{{[0-9]+}}), %{{.+}}
-// CHECK: br i1 %{{.+}}, label %[[NOT_MASTER:.+]], label %[[DONE:.+]]
-// CHECK: [[NOT_MASTER]]
-// CHECK: load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR]],
-// CHECK: store i{{[0-9]+}} %{{.+}}, i{{[0-9]+}}* %{{.+}},
+// ALL: call {{.*}}i8* @__kmpc_threadprivate_cached({{.+}} [[T_VAR]]
+// ALL: ptrtoint i{{[0-9]+}}* %{{.+}} to i{{[0-9]+}}
+// ALL: icmp ne i{{[0-9]+}} ptrtoint (i{{[0-9]+}}* [[T_VAR]] to i{{[0-9]+}}), %{{.+}}
+// ALL: br i1 %{{.+}}, label %[[NOT_MASTER:.+]], label %[[DONE:.+]]
+
+// IRBUILDER: ret void
+// IRBUILDER: [[DONE]]
+// IRBUILDER: [[GTID_CALL:%.+]] = call i32 @__kmpc_global_thread_num(%{{.+}}* @{{.+}})
+// IRBUILDER: call {{.*}}void @__kmpc_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i32 [[GTID_CALL]])
+
+// ALL: [[NOT_MASTER]]
+// ALL: load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR]],
+// ALL: store i{{[0-9]+}} %{{.+}}, i{{[0-9]+}}* %{{.+}},
 
 // TLS-CHECK: [[MASTER_REF:%.+]] = load i32*, i32** %
 // TLS-CHECK: [[MASTER_REF2:%.+]] = load [2 x i32]*, [2 x i32]** %
@@ -278,21 +316,21 @@
 // TLS-CHECK: store i32 [[MASTER_VAL]], i32* [[T_VAR]]
 
 // threadprivate_vec = vec;
-// CHECK: call {{.*}}i8* @__kmpc_threadprivate_cached({{.+}} [[VEC]]
-// CHECK: call void @llvm.memcpy{{.*}}(i8* align {{[0-9]+}}  %{{.+}}, i8* align {{[0-9]+}} bitcast ([2 x i{{[0-9]+}}]* [[VEC]] to i8*),
+// ALL: call {{.*}}i8* @__kmpc_threadprivate_cached({{.+}} [[VEC]]
+// ALL: call void @llvm.memcpy{{.*}}(i8* align {{[0-9]+}}  %{{.+}}, i8* align {{[0-9]+}} bitcast ([2 x i{{[0-9]+}}]* [[VEC]] to i8*),
 
 // TLS-CHECK: [[MASTER_CAST:%.+]] = bitcast [2 x i32]* [[MASTER_REF2]] to i8*
 // TLS-CHECK: call void @llvm.memcpy{{.*}}(i8* align {{[0-9]+}} bitcast ([2 x i{{[0-9]+}}]* [[VEC]] to i8*), i8* align {{[0-9]+}} [[MASTER_CAST]]
 
 // threadprivate_s_arr = s_arr;
-// CHECK: call {{.*}}i8* @__kmpc_threadprivate_cached({{.+}} [[S_ARR]]
-// CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* {{%.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
-// CHECK: [[S_ARR_PRIV_END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_PRIV_BEGIN]], i{{[0-9]+}} 2
-// CHECK: [[IS_EMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_PRIV_BEGIN]], [[S_ARR_PRIV_END]]
-// CHECK: br i1 [[IS_EMPTY]], label %[[S_ARR_BODY_DONE:.+]], label %[[S_ARR_BODY:.+]]
-// CHECK: [[S_ARR_BODY]]
-// CHECK: call {{.*}} [[S_FLOAT_TY_COPY_ASSIGN]]([[S_FLOAT_TY]]* {{.+}}, [[S_FLOAT_TY]]* {{.+}})
-// CHECK: br i1 {{.+}}, label %{{.+}}, label %[[S_ARR_BODY]]
+// ALL: call {{.*}}i8* @__kmpc_threadprivate_cached({{.+}} [[S_ARR]]
+// ALL: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* {{%.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// ALL: [[S_ARR_PRIV_END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_PRIV_BEGIN]], i{{[0-9]+}} 2
+// ALL: [[IS_EMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_PRIV_BEGIN]], [[S_ARR_PRIV_END]]
+// ALL: br i1 [[IS_EMPTY]], label %[[S_ARR_BODY_DONE:.+]], label %[[S_ARR_BODY:.+]]
+// ALL: [[S_ARR_BODY]]
+// ALL: call {{.*}} [[S_FLOAT_TY_COPY_ASSIGN]]([[S_FLOAT_TY]]* {{.+}}, [[S_FLOAT_TY]]* {{.+}})
+// ALL: br i1 {{.+}}, label %{{.+}}, label %[[S_ARR_BODY]]
 
 // TLS-CHECK: [[MASTER_CAST:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[MASTER_REF3]] to [[S_FLOAT_TY]]*
 // TLS-CHECK-DAG: [[S_ARR_SRC_BEGIN:%.+]] = phi [[S_FLOAT_TY]]* {{.*}}[[MASTER_CAST]]
@@ -305,8 +343,8 @@
 // TLS-CHECK: [[ARR_DONE]]
 
 // threadprivate_var = var;
-// CHECK: call {{.*}}i8* @__kmpc_threadprivate_cached({{.+}} [[VAR]]
-// CHECK: call {{.*}} [[S_FLOAT_TY_COPY_ASSIGN]]([[S_FLOAT_TY]]* {{%.+}}, [[S_FLOAT_TY]]* {{.*}}[[VAR]])
+// ALL: call {{.*}}i8* @__kmpc_threadprivate_cached({{.+}} [[VAR]]
+// ALL: call {{.*}} [[S_FLOAT_TY_COPY_ASSIGN]]([[S_FLOAT_TY]]* {{%.+}}, [[S_FLOAT_TY]]* {{.*}}[[VAR]])
 // CHECK: [[DONE]]
 
 // TLS-CHECK: call {{.*}} [[S_FLOAT_TY_COPY_ASSIGN]]([[S_FLOAT_TY]]* {{.*}}[[VAR]], [[S_FLOAT_TY]]* {{.*}}[[MASTER_REF4]])
@@ -356,36 +394,67 @@
 // TLS-CHECK: call {{.*}}void @__kmpc_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i32 [[GTID]])
 // TLS-CHECK: ret void
 
-// CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
-// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
-// CHECK: call {{.*}} [[S_INT_TY_COPY_ASSIGN:@.+]]([[S_INT_TY]]* [[TEST]], [[S_INT_TY]]*
-// CHECK: call {{.*}}void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 0, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*)* [[TMAIN_MICROTASK:@.+]] to void (i32*, i32*, ...)*))
-// CHECK: call {{.*}}void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 0, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*)* [[TMAIN_MICROTASK1:@.+]] to void (i32*, i32*, ...)*))
-// CHECK: call {{.*}} [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
-// CHECK: ret
+// ALL: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
+// ALL: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// ALL: call {{.*}} [[S_INT_TY_COPY_ASSIGN:@.+]]([[S_INT_TY]]* [[TEST]], [[S_INT_TY]]*
+// ALL: call {{.*}}void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 0, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*)* [[TMAIN_MICROTASK:@.+]] to void (i32*, i32*, ...)*))
+// ALL: call {{.*}}void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 0, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*)* [[TMAIN_MICROTASK1:@.+]] to void (i32*, i32*, ...)*))
+// ALL: call {{.*}} [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
+// ALL: ret
 
 // TLS-CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
 // TLS-CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
 // TLS-CHECK: call {{.*}} [[S_INT_TY_COPY_ASSIGN:@.+]]([[S_INT_TY]]* [[TEST]], [[S_INT_TY]]*
 // TLS-CHECK:     call {{.*}}void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 4, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, i32*, [2 x i32]*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]*)* [[TMAIN_MICROTASK:@.+]] to void (i32*, i32*, ...)*),
 // TLS-CHECK:     call {{.*}}void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, i32*)* [[TMAIN_MICROTASK1:@.+]] to void (i32*, i32*, ...)*),
+
+// IRBUILDER: define internal {{.*}}void [[TMAIN_MICROTASK1]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}})
+// IRBUILDER: [[GTID_LOCAL:%.+]] = alloca i32
+// IRBUILDER: [[GTID_LD:%.+]] = load i32, i32* [[GTID_ADDR]]
+// IRBUILDER: store i32 [[GTID_LD]], i32* [[GTID_LOCAL]]
+// IRBUILDER: [[GTID:%.+]] = load i32, i32* [[GTID_LOCAL]]
+// threadprivate_t_var = t_var;
+// IRBUILDER: call {{.*}}i8* @__kmpc_threadprivate_cached({{.+}} [[TMAIN_T_VAR]]
+// IRBUILDER: ptrtoint i{{[0-9]+}}* %{{.+}} to i{{[0-9]+}}
+// IRBUILDER: icmp ne i{{[0-9]+}} ptrtoint (i{{[0-9]+}}* [[TMAIN_T_VAR]] to i{{[0-9]+}}), %{{.+}}
+// IRBUILDER: br i1 %{{.+}}, label %[[NOT_MASTER:.+]], label %[[DONE:.+]]
+// IRBUILDER: ret void
+// IRBUILDER: [[DONE]]
+// IRBUILDER: [[GTID_CALL:%.+]] = call i32 @__kmpc_global_thread_num(%{{.+}}* @{{.+}})
+// IRBUILDER: call {{.*}}void @__kmpc_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i32 [[GTID_CALL]])
+// IRBUILDER: [[NOT_MASTER]]
+// IRBUILDER: load i{{[0-9]+}}, i{{[0-9]+}}* [[TMAIN_T_VAR]],
+// IRBUILDER: store i{{[0-9]+}} %{{.+}}, i{{[0-9]+}}* %{{.+}},
+
 //
-// CHECK: define internal {{.*}}void [[TMAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}})
+// ALL: define internal {{.*}}void [[TMAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}})
 // CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
 // CHECK: [[GTID_ADDR:%.+]] = load i32*, i32** [[GTID_ADDR_ADDR]],
 // CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_ADDR]],
+
+// IRBUILDER: [[GTID_LOCAL:%.+]] = alloca i32
+// IRBUILDER: [[GTID_LD:%.+]] = load i32, i32* [[GTID_ADDR]]
+// IRBUILDER: store i32 [[GTID_LD]], i32* [[GTID_LOCAL]]
+// IRBUILDER: [[GTID:%.+]] = load i32, i32* [[GTID_LOCAL]]
 //
 // TLS-CHECK: define internal {{.*}}void [[TMAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}})
 // TLS-CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
 
 // threadprivate_t_var = t_var;
-// CHECK: call {{.*}}i8* @__kmpc_threadprivate_cached({{.+}} [[TMAIN_T_VAR]]
-// CHECK: ptrtoint i{{[0-9]+}}* %{{.+}} to i{{[0-9]+}}
-// CHECK: icmp ne i{{[0-9]+}} ptrtoint (i{{[0-9]+}}* [[TMAIN_T_VAR]] to i{{[0-9]+}}), %{{.+}}
-// CHECK: br i1 %{{.+}}, label %[[NOT_MASTER:.+]], label %[[DONE:.+]]
-// CHECK: [[NOT_MASTER]]
-// CHECK: load i{{[0-9]+}}, i{{[0-9]+}}* [[TMAIN_T_VAR]], align 128
-// CHECK: store i{{[0-9]+}} %{{.+}}, i{{[0-9]+}}* %{{.+}}, align 128
+// ALL: call {{.*}}i8* @__kmpc_threadprivate_cached({{.+}} [[TMAIN_T_VAR]]
+// ALL: ptrtoint i{{[0-9]+}}* %{{.+}} to i{{[0-9]+}}
+// ALL: icmp ne i{{[0-9]+}} ptrtoint (i{{[0-9]+}}* [[TMAIN_T_VAR]] to i{{[0-9]+}}), %{{.+}}
+// ALL: br i1 %{{.+}}, label %[[NOT_MASTER:.+]], label %[[DONE:.+]]
+
+// IRBUILDER: ret void
+
+// IRBUILDER: [[DONE]]
+// IRBUILDER: [[GTID_CALL:%.+]] = call i32 @__kmpc_global_thread_num(%{{.+}}* @{{.+}})
+// IRBUILDER: call {{.*}}void @__kmpc_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i32 [[GTID_CALL]])
+
+// ALL: [[NOT_MASTER]]
+// ALL: load i{{[0-9]+}}, i{{[0-9]+}}* [[TMAIN_T_VAR]], align 128
+// ALL: store i{{[0-9]+}} %{{.+}}, i{{[0-9]+}}* %{{.+}}, align 128
 
 // TLS-CHECK: [[MASTER_REF:%.+]] = load i32*, i32** %
 // TLS-CHECK: [[MASTER_REF1:%.+]] = load [2 x i32]*, [2 x i32]** %
@@ -400,21 +469,21 @@
 // TLS-CHECK: store i32 [[MASTER_VAL]], i32* [[TMAIN_T_VAR]], align 128
 
 // threadprivate_vec = vec;
-// CHECK: call {{.*}}i8* @__kmpc_threadprivate_cached({{.+}} [[TMAIN_VEC]]
-// CHECK: call {{.*}}void @llvm.memcpy{{.*}}(i8* align {{[0-9]+}} %{{.+}}, i8* align {{[0-9]+}} bitcast ([2 x i{{[0-9]+}}]* [[TMAIN_VEC]] to i8*),
+// ALL: call {{.*}}i8* @__kmpc_threadprivate_cached({{.+}} [[TMAIN_VEC]]
+// ALL: call {{.*}}void @llvm.memcpy{{.*}}(i8* align {{[0-9]+}} %{{.+}}, i8* align {{[0-9]+}} bitcast ([2 x i{{[0-9]+}}]* [[TMAIN_VEC]] to i8*),
 
 // TLS-CHECK: [[MASTER_CAST:%.+]] = bitcast [2 x i32]* [[MASTER_REF1]] to i8*
 // TLS-CHECK: call void @llvm.memcpy{{.*}}(i8* align {{[0-9]+}} bitcast ([2 x i{{[0-9]+}}]* [[TMAIN_VEC]] to i8*), i8* align {{[0-9]+}} [[MASTER_CAST]]
 
 // threadprivate_s_arr = s_arr;
-// CHECK: call {{.*}}i8* @__kmpc_threadprivate_cached({{.+}} [[TMAIN_S_ARR]]
-// CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* {{%.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
-// CHECK: [[S_ARR_PRIV_END:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_PRIV_BEGIN]], i{{[0-9]+}} 2
-// CHECK: [[IS_EMPTY:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_PRIV_BEGIN]], [[S_ARR_PRIV_END]]
-// CHECK: br i1 [[IS_EMPTY]], label %[[S_ARR_BODY_DONE:.+]], label %[[S_ARR_BODY:.+]]
-// CHECK: [[S_ARR_BODY]]
-// CHECK: call {{.*}} [[S_INT_TY_COPY_ASSIGN]]([[S_INT_TY]]* {{.+}}, [[S_INT_TY]]* {{.+}})
-// CHECK: br i1 {{.+}}, label %{{.+}}, label %[[S_ARR_BODY]]
+// ALL: call {{.*}}i8* @__kmpc_threadprivate_cached({{.+}} [[TMAIN_S_ARR]]
+// ALL: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* {{%.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// ALL: [[S_ARR_PRIV_END:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_PRIV_BEGIN]], i{{[0-9]+}} 2
+// ALL: [[IS_EMPTY:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_PRIV_BEGIN]], [[S_ARR_PRIV_END]]
+// ALL: br i1 [[IS_EMPTY]], label %[[S_ARR_BODY_DONE:.+]], label %[[S_ARR_BODY:.+]]
+// ALL: [[S_ARR_BODY]]
+// ALL: call {{.*}} [[S_INT_TY_COPY_ASSIGN]]([[S_INT_TY]]* {{.+}}, [[S_INT_TY]]* {{.+}})
+// ALL: br i1 {{.+}}, label %{{.+}}, label %[[S_ARR_BODY]]
 
 // TLS-CHECK: [[MASTER_CAST:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[MASTER_REF2]] to [[S_INT_TY]]*
 // TLS-CHECK-DAG: [[S_ARR_SRC_BEGIN:%.+]] = phi [[S_INT_TY]]* {{.*}}[[MASTER_CAST]]
@@ -427,8 +496,8 @@
 // TLS-CHECK: [[ARR_DONE]]
 
 // threadprivate_var = var;
-// CHECK: call {{.*}}i8* @__kmpc_threadprivate_cached({{.+}} [[TMAIN_VAR]]
-// CHECK: call {{.*}} [[S_INT_TY_COPY_ASSIGN]]([[S_INT_TY]]* {{%.+}}, [[S_INT_TY]]* {{.*}}[[TMAIN_VAR]])
+// ALL: call {{.*}}i8* @__kmpc_threadprivate_cached({{.+}} [[TMAIN_VAR]]
+// ALL: call {{.*}} [[S_INT_TY_COPY_ASSIGN]]([[S_INT_TY]]* {{%.+}}, [[S_INT_TY]]* {{.*}}[[TMAIN_VAR]])
 // CHECK: [[DONE]]
 
 // TLS-CHECK: call {{.*}} [[S_INT_TY_COPY_ASSIGN]]([[S_INT_TY]]* {{.*}}[[TMAIN_VAR]], [[S_INT_TY]]* {{.*}}[[MASTER_REF3]])
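
For orientation, the checks above correspond to a 'copyin' clause on threadprivate variables: each non-master thread fetches its threadprivate copy (via __kmpc_threadprivate_cached, or TLS in the TLS-CHECK runs) and copy-assigns the master's value into it, followed by an implicit barrier. A minimal sketch of source with that shape (simplified, illustrative names; not the test source itself):

struct S {
  int a;
  S &operator=(const S &rhs) { a = rhs.a; return *this; }  // matched by the copy-assign checks
};

int t_var;
int vec[2];
S s_arr[2];
S var;
#pragma omp threadprivate(t_var, vec, s_arr, var)

int tmain() {
#pragma omp parallel copyin(t_var, vec, s_arr, var)
  {
    // Every thread sees the master's values of its threadprivate copies here.
    vec[0] = t_var;
    s_arr[0] = var;
  }
  return 0;
}
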
Index: clang/lib/CodeGen/CodeGenFunction.h
===================================================================
--- clang/lib/CodeGen/CodeGenFunction.h
+++ clang/lib/CodeGen/CodeGenFunction.h
@@ -1555,6 +1555,14 @@
 
     using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
 
+    struct CapturedVarInfo {
+
+      enum CapturedVarKind { ByValue, ByRef };
+
+      llvm::Value *PassedValue;
+      CapturedVarKind CapturedKind;
+    };
+
     /// Cleanup action for allocate support.
     class OMPAllocateCleanupTy final : public EHScopeStack::Cleanup {
 
@@ -1596,6 +1604,34 @@
     static Address getAddressOfLocalVariable(CodeGenFunction &CGF,
                                              const VarDecl *VD);
 
+    /// Emit the 'firstprivate' clause of the given directive.
+    ///
+    /// \param CGF CodeGenFunction for the function containing the directive.
+    /// \param D The directive the 'firstprivate' clause is associated with.
+    /// \param PrivateScope Private scope for all captured variables in the
+    /// current associated directive.
+    /// \param CapturedVarsInfoMap Map of captured variables to their
+    /// generated values.
+    /// \return True if any firstprivate variables were generated.
+    static bool EmitOMPFirstprivateClause(
+        CodeGenFunction &CGF, const OMPExecutableDirective &D,
+        OMPPrivateScope &PrivateScope,
+        llvm::SmallDenseMap<const VarDecl *, CapturedVarInfo>
+            &CapturedVarsInfoMap);
+
+    static bool EmitOMPCopyinClause(CodeGenFunction &CGF,
+                                    const OMPExecutableDirective &D,
+                                    InsertPointTy AllocaIP);
+
+    /// Create specialized alloca to handle lastprivate conditionals.
+    static Address emitLastprivateConditionalInit(CodeGenFunction &CGF,
+                                                  const VarDecl *VD);
+
+    static void GenerateOpenMPCapturedVars(
+        CodeGenFunction &CGF, const CapturedStmt &S,
+        llvm::SmallDenseMap<const VarDecl *, CapturedVarInfo>
+            &CapturedVarsInfoMap);
+
     /// Get the platform-specific name separator.
     /// \param Parts different parts of the final name that needs separation
     /// \param FirstSeparator First separator used between the initial two
@@ -1711,6 +1747,14 @@
   };
 
 private:
+  /// Maps local variables marked as lastprivate conditional to their internal
+  /// types.
+  llvm::DenseMap<llvm::Function *,
+                 llvm::DenseMap<CanonicalDeclPtr<const Decl>,
+                                std::tuple<QualType, const FieldDecl *,
+                                           const FieldDecl *, LValue>>>
+      LastprivateConditionalToTypes;
+
   /// CXXThisDecl - When generating code for a C++ member function,
   /// this will hold the implicit 'this' declaration.
   ImplicitParamDecl *CXXABIThisDecl = nullptr;
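
The new CapturedVarKind distinction (ByValue vs. ByRef) mirrors how the outlined parallel function receives its captures, which is what EmitOMPFirstprivateClause branches on below. A minimal sketch (illustrative names, not taken from the patch or its tests) of source whose 'firstprivate' captures exercise both cases:

struct Payload {
  int data[4];
};

int run() {
  int t = 1;       // a scalar is typically forwarded by value (ByValue)
  Payload p = {};  // an aggregate is typically forwarded by address (ByRef)
#pragma omp parallel firstprivate(t, p)
  {
    t += p.data[0];  // each thread operates on its own private copies
  }
  return t;
}

In the ByValue case the helper reconstitutes the value through a uintptr-sized temporary (castValueFromUintptr); in the ByRef case it loads the forwarded pointer and, for non-trivial initializers, copy-constructs into a fresh local.
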
Index: clang/lib/CodeGen/CGStmtOpenMP.cpp
===================================================================
--- clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -795,14 +795,8 @@
       if (DeviceConstTarget && OrigVD->getType().isConstant(getContext()) &&
           FD && FD->getType()->isReferenceType() &&
           (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())) {
-        if (OMPBuilder)
-          // TODO: Move and modify this function based on target regions after
-          // they land
-          (void)CGM.getOpenMPRuntime().registerTargetFirstprivateCopy(*this,
-                                                                      OrigVD);
-        else
-          (void)CGM.getOpenMPRuntime().registerTargetFirstprivateCopy(*this,
-                                                                      OrigVD);
+        (void)CGM.getOpenMPRuntime().registerTargetFirstprivateCopy(*this,
+                                                                    OrigVD);
         ++IRef;
         ++InitsRef;
         continue;
@@ -886,9 +880,11 @@
                     Lastprivates[OrigVD->getCanonicalDecl()] ==
                         OMPC_LASTPRIVATE_conditional) {
                   // Create/init special variable for lastprivate conditionals.
-                  Address VDAddr =
+                  Address VDAddr = Address::invalid();
+                  VDAddr =
                       CGM.getOpenMPRuntime().emitLastprivateConditionalInit(
                           *this, OrigVD);
+
                   llvm::Value *V = EmitLoadOfScalar(
                       MakeAddrLValue(GetAddrOfLocalVar(VD), (*IRef)->getType(),
                                      AlignmentSource::Decl),
@@ -1438,6 +1434,138 @@
                                      const OMPExecutableDirective &,
                                      llvm::SmallVectorImpl<llvm::Value *> &) {}
 
+static FieldDecl *addFieldToRecordDecl(ASTContext &C, DeclContext *DC,
+                                       QualType FieldTy) {
+  auto *Field = FieldDecl::Create(
+      C, DC, SourceLocation(), SourceLocation(), /*Id=*/nullptr, FieldTy,
+      C.getTrivialTypeSourceInfo(FieldTy, SourceLocation()),
+      /*BW=*/nullptr, /*Mutable=*/false, /*InitStyle=*/ICIS_NoInit);
+  Field->setAccess(AS_public);
+  DC->addDecl(Field);
+  return Field;
+}
+
+Address CodeGenFunction::OMPBuilderCBHelpers::emitLastprivateConditionalInit(
+    CodeGenFunction &CGF, const VarDecl *VD) {
+  ASTContext &C = CGF.CGM.getContext();
+  auto I = CGF.LastprivateConditionalToTypes.find(CGF.CurFn);
+  if (I == CGF.LastprivateConditionalToTypes.end())
+    I = CGF.LastprivateConditionalToTypes.try_emplace(CGF.CurFn).first;
+  QualType NewType;
+  const FieldDecl *VDField;
+  const FieldDecl *FiredField;
+  LValue BaseLVal;
+  auto VI = I->getSecond().find(VD);
+  if (VI == I->getSecond().end()) {
+    RecordDecl *RD = C.buildImplicitRecord("lasprivate.conditional");
+    RD->startDefinition();
+    VDField = addFieldToRecordDecl(C, RD, VD->getType().getNonReferenceType());
+    FiredField = addFieldToRecordDecl(C, RD, C.CharTy);
+    RD->completeDefinition();
+    NewType = C.getRecordType(RD);
+    Address Addr =
+        CGF.CreateMemTemp(NewType, C.getDeclAlign(VD), VD->getName());
+    BaseLVal = CGF.MakeAddrLValue(Addr, NewType, AlignmentSource::Decl);
+    I->getSecond().try_emplace(VD, NewType, VDField, FiredField, BaseLVal);
+  } else {
+    NewType = std::get<0>(VI->getSecond());
+    VDField = std::get<1>(VI->getSecond());
+    FiredField = std::get<2>(VI->getSecond());
+    BaseLVal = std::get<3>(VI->getSecond());
+  }
+  LValue FiredLVal = CGF.EmitLValueForField(BaseLVal, FiredField);
+  CGF.EmitStoreOfScalar(
+      llvm::ConstantInt::getNullValue(CGF.ConvertTypeForMem(C.CharTy)),
+      FiredLVal);
+  return CGF.EmitLValueForField(BaseLVal, VDField).getAddress(CGF);
+}
+
+bool CodeGenFunction::OMPBuilderCBHelpers::EmitOMPCopyinClause(
+    CodeGenFunction &CGF, const OMPExecutableDirective &D,
+    InsertPointTy AllocaIP) {
+  if (!CGF.HaveInsertPoint())
+    return false;
+  // threadprivate_var1 = master_threadprivate_var1;
+  // operator=(threadprivate_var2, master_threadprivate_var2);
+  // ...
+  // __kmpc_barrier(&loc, global_tid);
+  llvm::OpenMPIRBuilder *OMPBuilder = CGF.CGM.getOpenMPIRBuilder();
+  llvm::DenseSet<const VarDecl *> CopiedVars;
+  llvm::BasicBlock *CopyBegin = nullptr, *CopyEnd = nullptr;
+  for (const auto *C : D.template getClausesOfKind<OMPCopyinClause>()) {
+    auto IRef = C->varlist_begin();
+    auto ISrcRef = C->source_exprs().begin();
+    auto IDestRef = C->destination_exprs().begin();
+    for (const Expr *AssignOp : C->assignment_ops()) {
+      const auto *VD = cast<VarDecl>(cast<DeclRefExpr>(*IRef)->getDecl());
+      QualType Type = VD->getType();
+      if (CopiedVars.insert(VD->getCanonicalDecl()).second) {
+        // Get the address of the master variable. If we are emitting code with
+        // TLS support, the address is passed from the master as a field in
+        // the captured declaration.
+        Address MasterAddr = Address::invalid();
+        if (CGF.getLangOpts().OpenMPUseTLS &&
+            CGF.getContext().getTargetInfo().isTLSSupported()) {
+          assert(CGF.CapturedStmtInfo->lookup(VD) &&
+                 "Copyin threadprivates should have been captured!");
+          const auto *VDCanon = VD->getCanonicalDecl();
+          auto I = CGF.LocalDeclMap.find(VDCanon);
+          if (I == CGF.LocalDeclMap.end()) {
+            Address Addr(CGF.CGM.GetAddrOfGlobal(VDCanon),
+                         CGF.getContext().getDeclAlign(VDCanon));
+            CGF.LocalDeclMap.try_emplace(VDCanon, Addr);
+          }
+          DeclRefExpr DRE(CGF.getContext(), const_cast<VarDecl *>(VD), true,
+                          (*IRef)->getType(), VK_LValue, (*IRef)->getExprLoc());
+          MasterAddr = CGF.EmitLValue(&DRE).getAddress(CGF);
+          CGF.LocalDeclMap.erase(VDCanon);
+        } else {
+          MasterAddr = Address(VD->isStaticLocal()
+                                   ? CGF.CGM.getStaticLocalDeclAddress(VD)
+                                   : CGF.CGM.GetAddrOfGlobal(VD),
+                               CGF.getContext().getDeclAlign(VD));
+        }
+
+        // Get the address of the threadprivate variable.
+        Address PrivateAddr = CGF.EmitLValue(*IRef).getAddress(CGF);
+        if (CopiedVars.size() == 1) {
+          // First check whether the current thread is the master thread. If
+          // it is, there is no need to copy data.
+          InsertPointTy CopyBeginIP = OMPBuilder->CreateCopyinClauseBlocks(
+              AllocaIP, MasterAddr.getPointer(), PrivateAddr.getPointer(),
+              CGF.IntPtrTy, /*BranchtoEnd*/ false);
+          CGF.Builder.restoreIP(CopyBeginIP);
+          CopyBegin = CopyBeginIP.getBlock();
+          assert(CopyBegin && "CopyIn Basic Block was not generated!");
+          llvm::BranchInst *EntryCBI = llvm::dyn_cast_or_null<llvm::BranchInst>(
+              AllocaIP.getBlock()->getTerminator());
+          CopyEnd = EntryCBI ? EntryCBI->getSuccessor(1) : nullptr;
+          assert(CopyEnd && "No unique successor for CopyIn Basic Block!");
+        }
+        const auto *SrcVD =
+            cast<VarDecl>(cast<DeclRefExpr>(*ISrcRef)->getDecl());
+        const auto *DestVD =
+            cast<VarDecl>(cast<DeclRefExpr>(*IDestRef)->getDecl());
+        CGF.EmitOMPCopy(Type, PrivateAddr, MasterAddr, DestVD, SrcVD, AssignOp);
+      }
+      ++IRef;
+      ++ISrcRef;
+      ++IDestRef;
+    }
+  }
+  if (CopyEnd) {
+    // Exit out of copying procedure for non-master thread.
+    CGF.EmitBranch(CopyEnd);
+    if (llvm::Instruction *CopyEndTI =
+            CopyEnd ? CopyEnd->getTerminator() : nullptr)
+      CGF.Builder.SetInsertPoint(CopyEndTI);
+    else
+      CGF.Builder.SetInsertPoint(CopyEnd);
+    return true;
+  }
+  return false;
+}
+
 Address CodeGenFunction::OMPBuilderCBHelpers::getAddressOfLocalVariable(
     CodeGenFunction &CGF, const VarDecl *VD) {
   CodeGenModule &CGM = CGF.CGM;
@@ -1528,6 +1656,293 @@
   }
   return OS.str().str();
 }
+
+bool CodeGenFunction::OMPBuilderCBHelpers::EmitOMPFirstprivateClause(
+    CodeGenFunction &CGF, const OMPExecutableDirective &D,
+    OMPPrivateScope &PrivateScope,
+    llvm::SmallDenseMap<const VarDecl *, CapturedVarInfo>
+        &CapturedVarsInfoMap) {
+  if (!CGF.HaveInsertPoint())
+    return false;
+
+  CodeGenModule &CGM = CGF.CGM;
+  bool DeviceConstTarget =
+      CGF.getLangOpts().OpenMPIsDevice &&
+      isOpenMPTargetExecutionDirective(D.getDirectiveKind());
+  bool FirstprivateIsLastprivate = false;
+  llvm::DenseMap<const VarDecl *, OpenMPLastprivateModifier> Lastprivates;
+  for (const auto *C : D.getClausesOfKind<OMPLastprivateClause>()) {
+    for (const auto *D : C->varlists())
+      Lastprivates.try_emplace(
+          cast<VarDecl>(cast<DeclRefExpr>(D)->getDecl())->getCanonicalDecl(),
+          C->getKind());
+  }
+  llvm::DenseSet<const VarDecl *> EmittedAsFirstprivate;
+  llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
+  getOpenMPCaptureRegions(CaptureRegions, D.getDirectiveKind());
+  // Force emission of the firstprivate copy if the directive does not emit an
+  // outlined function, e.g. omp for, omp simd, omp distribute, etc.
+  bool MustEmitFirstprivateCopy =
+      CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown;
+  for (const auto *C : D.getClausesOfKind<OMPFirstprivateClause>()) {
+    const auto *IRef = C->varlist_begin();
+    const auto *InitsRef = C->inits().begin();
+    for (const Expr *IInit : C->private_copies()) {
+      const auto *OrigVD = cast<VarDecl>(cast<DeclRefExpr>(*IRef)->getDecl());
+      bool ThisFirstprivateIsLastprivate =
+          Lastprivates.count(OrigVD->getCanonicalDecl()) > 0;
+      const FieldDecl *FD = CGF.CapturedStmtInfo->lookup(OrigVD);
+      const auto *VD = cast<VarDecl>(cast<DeclRefExpr>(IInit)->getDecl());
+      if (!MustEmitFirstprivateCopy && !ThisFirstprivateIsLastprivate && FD &&
+          !FD->getType()->isReferenceType() &&
+          (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())) {
+        if (CapturedVarsInfoMap[OrigVD].CapturedKind !=
+                CapturedVarInfo::ByValue ||
+            OrigVD->isConstexpr()) {
+          EmittedAsFirstprivate.insert(OrigVD->getCanonicalDecl());
+          ++IRef;
+          ++InitsRef;
+          continue;
+        }
+      }
+      // Do not emit copy for firstprivate constant variables in target regions,
+      // captured by reference.
+      if (DeviceConstTarget && OrigVD->getType().isConstant(CGF.getContext()) &&
+          FD && FD->getType()->isReferenceType() &&
+          (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())) {
+        // TODO: Move and modify this function based on target regions after
+        // they land
+        (void)CGM.getOpenMPRuntime().registerTargetFirstprivateCopy(CGF,
+                                                                    OrigVD);
+        ++IRef;
+        ++InitsRef;
+        continue;
+      }
+      FirstprivateIsLastprivate =
+          FirstprivateIsLastprivate || ThisFirstprivateIsLastprivate;
+      if (EmittedAsFirstprivate.insert(OrigVD->getCanonicalDecl()).second) {
+        const auto *VDInit =
+            cast<VarDecl>(cast<DeclRefExpr>(*InitsRef)->getDecl());
+        bool IsRegistered;
+        DeclRefExpr DRE(CGF.getContext(), const_cast<VarDecl *>(OrigVD),
+                        /*RefersToEnclosingVariableOrCapture=*/FD != nullptr,
+                        (*IRef)->getType(), VK_LValue, (*IRef)->getExprLoc());
+        LValue OriginalLVal;
+        if (!FD) {
+          // Check if the firstprivate variable is just a constant value.
+          ConstantEmission CE = CGF.tryEmitAsConstant(&DRE);
+          if (CE && !CE.isReference()) {
+            // Constant value, no need to create a copy.
+            ++IRef;
+            ++InitsRef;
+            continue;
+          }
+          if (CE && CE.isReference()) {
+            OriginalLVal = CE.getReferenceLValue(CGF, &DRE);
+          } else {
+            assert(!CE && "Expected non-constant firstprivate.");
+            OriginalLVal = CGF.EmitLValue(&DRE);
+          }
+        } else {
+          OriginalLVal = CGF.EmitLValue(&DRE);
+        }
+        QualType Type = VD->getType();
+        if (Type->isArrayType()) {
+          // Emit VarDecl with copy init for arrays.
+          // Get the address of the original variable captured in current
+          // captured region.
+          IsRegistered = PrivateScope.addPrivate(
+              OrigVD, [&CGF, VD, Type, OriginalLVal, VDInit]() {
+                AutoVarEmission Emission = CGF.EmitAutoVarAlloca(*VD);
+                const Expr *Init = VD->getInit();
+                if (!isa<CXXConstructExpr>(Init) ||
+                    CGF.isTrivialInitializer(Init)) {
+                  // Perform simple memcpy.
+                  LValue Dest =
+                      CGF.MakeAddrLValue(Emission.getAllocatedAddress(), Type);
+                  CGF.EmitAggregateAssign(Dest, OriginalLVal, Type);
+                } else {
+                  CGF.EmitOMPAggregateAssign(
+                      Emission.getAllocatedAddress(),
+                      OriginalLVal.getAddress(CGF), Type,
+                      [&CGF, VDInit, Init](Address DestElement,
+                                           Address SrcElement) {
+                        // Clean up any temporaries needed by the
+                        // initialization.
+                        RunCleanupsScope InitScope(CGF);
+                        // Emit initialization for single element.
+                        CGF.setAddrOfLocalVar(VDInit, SrcElement);
+                        CGF.EmitAnyExprToMem(Init, DestElement,
+                                             Init->getType().getQualifiers(),
+                                             /*IsInitializer*/ false);
+                        CGF.LocalDeclMap.erase(VDInit);
+                      });
+                }
+                CGF.EmitAutoVarCleanups(Emission);
+                return Emission.getAllocatedAddress();
+              });
+        } else {
+          Address OriginalAddr = OriginalLVal.getAddress(CGF);
+          IsRegistered = PrivateScope.addPrivate(
+              OrigVD, [&CGF, VDInit, OriginalAddr, VD,
+                       ThisFirstprivateIsLastprivate, OrigVD, &Lastprivates,
+                       IRef, &IInit, &CGM, &CapturedVarsInfoMap, &FD]() {
+                Address VarAddr = OriginalAddr;
+                ASTContext &Ctx = CGF.getContext();
+                QualType UIntPtrTy = Ctx.getUIntPtrType();
+                llvm::Value *V = CapturedVarsInfoMap[OrigVD].PassedValue;
+                if (CapturedVarsInfoMap[OrigVD].CapturedKind ==
+                    CapturedVarInfo::ByValue) {
+                  CGF.setAddrOfLocalVar(VDInit, OriginalAddr);
+                  CharUnits AddrAlign = Ctx.getDeclAlign(&*VD);
+                  Address DeclPtr = CGF.CreateMemTemp(UIntPtrTy, AddrAlign,
+                                                      VD->getName() + ".addr");
+                  LValue DstLV = CGF.MakeAddrLValue(DeclPtr, UIntPtrTy,
+                                                    AlignmentSource::Decl);
+                  CGF.EmitStoreOfScalar(V, DstLV);
+                  if (!OrigVD->getType()->isPointerType()) {
+                    VarAddr = DeclPtr;
+                    if (VD->getType() != UIntPtrTy)
+                      VarAddr = castValueFromUintptr(
+                          CGF, (*IRef)->getExprLoc(), VD->getType(),
+                          VD->getName(),
+                          CGF.MakeAddrLValue(DeclPtr, UIntPtrTy));
+                    Address OMPAddress = getAddressOfLocalVariable(CGF, VD);
+                    if (OMPAddress.isValid()) {
+                      LValue VarAddrLV = CGF.MakeAddrLValue(
+                          VarAddr, VD->getType(), AlignmentSource::Decl);
+                      llvm::Value *CV =
+                          CGF.EmitLoadOfScalar(VarAddrLV, IInit->getBeginLoc());
+                      CGF.EmitStoreOfScalar(
+                          CV, CGF.MakeAddrLValue(OMPAddress, VD->getType(),
+                                                 AlignmentSource::Decl));
+                      VarAddr = OMPAddress;
+                    }
+                    CGF.setAddrOfLocalVar(VD, VarAddr);
+                  } else {
+                    llvm_unreachable("Unhandled Captured by Value VarDecl!");
+                  }
+                } else {
+                  QualType VDPtrTy = Ctx.getPointerType(VD->getType());
+                  CharUnits AddrAlign = Ctx.getDeclAlign(&*VD);
+                  Address DeclPtr = CGF.CreateMemTemp(VDPtrTy, AddrAlign,
+                                                      VD->getName() + ".addr");
+                  LValue DstLV = CGF.MakeAddrLValue(DeclPtr, VDPtrTy,
+                                                    AlignmentSource::Decl);
+                  CGF.EmitStoreOfScalar(V, DstLV);
+                  llvm::Value *PtrLd =
+                      CGF.EmitLoadOfScalar(DstLV, IInit->getBeginLoc());
+                  VarAddr = Address(PtrLd, AddrAlign);
+                  if (VD->getType() != VDPtrTy)
+                    VarAddr = castValueFromUintptr(
+                        CGF, (*IRef)->getExprLoc(), VD->getType(),
+                        VD->getName(), CGF.MakeAddrLValue(VarAddr, VDPtrTy));
+                  CGF.setAddrOfLocalVar(VD, VarAddr);
+                  CGF.setAddrOfLocalVar(VDInit, VarAddr);
+                  const auto *cleanups =
+                      dyn_cast<ExprWithCleanups>(VD->getInit());
+                  const Expr *Init =
+                      (cleanups) ? cleanups->getSubExpr() : VD->getInit();
+                  if (isa<CXXConstructExpr>(Init) &&
+                      !CGF.isTrivialInitializer(Init)) {
+                    if (cleanups) {
+                      CGF.enterFullExpression(cleanups);
+                      RunCleanupsScope InitScope(CGF);
+                    }
+
+                    Address DstPtr = CGF.CreateMemTemp(VD->getType(), AddrAlign,
+                                                       VD->getName());
+                    CGF.EmitAnyExprToMem(Init, DstPtr,
+                                         Init->getType().getQualifiers(),
+                                         /*IsInitializer*/ false);
+                    CGF.LocalDeclMap.erase(VDInit);
+                    VarAddr = DstPtr;
+                    // TODO: Emit cleanup info for the variable.
+                  }
+                }
+
+                CGF.LocalDeclMap.erase(VDInit);
+                if (ThisFirstprivateIsLastprivate &&
+                    Lastprivates[OrigVD->getCanonicalDecl()] ==
+                        OMPC_LASTPRIVATE_conditional) {
+                  // Create/init special variable for lastprivate conditionals.
+                  Address VDAddr = Address::invalid();
+                  VDAddr = emitLastprivateConditionalInit(CGF, OrigVD);
+                  llvm::Value *V = CGF.EmitLoadOfScalar(
+                      CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD),
+                                         (*IRef)->getType(),
+                                         AlignmentSource::Decl),
+                      (*IRef)->getExprLoc());
+                  CGF.EmitStoreOfScalar(
+                      V, CGF.MakeAddrLValue(VDAddr, (*IRef)->getType(),
+                                            AlignmentSource::Decl));
+                  CGF.LocalDeclMap.erase(VD);
+                  CGF.setAddrOfLocalVar(VD, VDAddr);
+                  return VDAddr;
+                }
+                return CGF.GetAddrOfLocalVar(VD);
+              });
+        }
+        assert(IsRegistered &&
+               "firstprivate var already registered as private");
+        // Silence the warning about unused variable.
+        (void)IsRegistered;
+      }
+      ++IRef;
+      ++InitsRef;
+    }
+  }
+  return FirstprivateIsLastprivate && !EmittedAsFirstprivate.empty();
+}
+
+void CodeGenFunction::OMPBuilderCBHelpers::GenerateOpenMPCapturedVars(
+    CodeGenFunction &CGF, const CapturedStmt &S,
+    llvm::SmallDenseMap<const VarDecl *, CapturedVarInfo>
+        &CapturedVarsValueMap) {
+  const RecordDecl *RD = S.getCapturedRecordDecl();
+  auto CurField = RD->field_begin();
+  auto CurCap = S.captures().begin();
+  for (CapturedStmt::const_capture_init_iterator I = S.capture_init_begin(),
+                                                 E = S.capture_init_end();
+       I != E; ++I, ++CurField, ++CurCap) {
+    if (CurField->hasCapturedVLAType() || CurCap->capturesThis()) {
+      // Nothing to do for captured VLA types or 'this' captures.
+    } else if (CurCap->capturesVariableByCopy()) {
+      llvm::Value *CV =
+          CGF.EmitLoadOfScalar(CGF.EmitLValue(*I), CurCap->getLocation());
+
+      // If the field is not a pointer, we need to save the actual value
+      // and load it as a void pointer.
+      if (!CurField->getType()->isAnyPointerType()) {
+        ASTContext &Ctx = CGF.getContext();
+        Address DstAddr = CGF.CreateMemTemp(
+            Ctx.getUIntPtrType(),
+            Twine(CurCap->getCapturedVar()->getName(), ".casted"));
+        LValue DstLV = CGF.MakeAddrLValue(DstAddr, Ctx.getUIntPtrType());
+
+        llvm::Value *SrcAddrVal = CGF.EmitScalarConversion(
+            DstAddr.getPointer(), Ctx.getPointerType(Ctx.getUIntPtrType()),
+            Ctx.getPointerType(CurField->getType()), CurCap->getLocation());
+        LValue SrcLV =
+            CGF.MakeNaturalAlignAddrLValue(SrcAddrVal, CurField->getType());
+
+        // Store the value using the source type pointer.
+        CGF.EmitStoreThroughLValue(RValue::get(CV), SrcLV);
+
+        // Load the value using the destination type pointer.
+        CV = CGF.EmitLoadOfScalar(DstLV, CurCap->getLocation());
+      }
+      CapturedVarsValueMap[CurCap->getCapturedVar()] = {
+          CV, CapturedVarInfo::ByValue};
+    } else {
+      assert(CurCap->capturesVariable() && "Expected capture by reference.");
+      CapturedVarsValueMap[CurCap->getCapturedVar()] = {
+          CGF.EmitLValue(*I).getAddress(CGF).getPointer(),
+          CapturedVarInfo::ByRef};
+    }
+  }
+}
+
 void CodeGenFunction::EmitOMPParallelDirective(const OMPParallelDirective &S) {
   if (llvm::OpenMPIRBuilder *OMPBuilder = CGM.getOpenMPIRBuilder()) {
     // Check if we have any if clause associated with the directive.
@@ -1561,6 +1976,7 @@
                      llvm::Value &Val, llvm::Value *&ReplVal) {
       // The next line is appropriate only for variables (Val) with the
       // data-sharing attribute "shared".
+
       ReplVal = &Val;
 
       return CodeGenIP;
@@ -1569,17 +1985,70 @@
     const CapturedStmt *CS = S.getCapturedStmt(OMPD_parallel);
     const Stmt *ParallelRegionBodyStmt = CS->getCapturedStmt();
 
-    auto BodyGenCB = [ParallelRegionBodyStmt,
-                      this](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
-                            llvm::BasicBlock &ContinuationBB) {
+    OMPParallelScope Scope(*this, S);
+    llvm::SmallDenseMap<const VarDecl *, OMPBuilderCBHelpers::CapturedVarInfo>
+        CapturedVarsInfoMap;
+
+    OMPBuilderCBHelpers::GenerateOpenMPCapturedVars(*this, *CS,
+                                                    CapturedVarsInfoMap);
+
+    auto BodyGenCB = [ParallelRegionBodyStmt, &S, this, &OMPBuilder,
+                      &CapturedVarsInfoMap](InsertPointTy AllocaIP,
+                                            InsertPointTy CodeGenIP,
+                                            llvm::BasicBlock &ContinuationBB) {
       OMPBuilderCBHelpers::OutlinedRegionBodyRAII ORB(*this, AllocaIP,
                                                       ContinuationBB);
+
+      OMPPrivateScope PrivateScope(*this);
+      llvm::BasicBlock *OMP_Entry = AllocaIP.getBlock();
+      // Emit the 'copyin' clause.
+      Builder.SetInsertPoint(OMP_Entry->getTerminator());
+      bool Copyins =
+          OMPBuilderCBHelpers::EmitOMPCopyinClause(*this, S, AllocaIP);
+
+      // Restore the alloca insertion point to the entry block since it moved
+      // while emitting the 'copyin' blocks.
+      AllocaInsertPt = OMPBuilderCBHelpers::GetAllocaInsertPoint(OMP_Entry);
+      llvm::BranchInst *EntryBI =
+          cast<llvm::BranchInst>(OMP_Entry->getTerminator());
+      EntryBI->removeFromParent();
+
+      if (Builder.GetInsertBlock() == OMP_Entry)
+        Builder.SetInsertPoint(OMP_Entry);
+      OMPBuilderCBHelpers::EmitOMPFirstprivateClause(*this, S, PrivateScope,
+                                                     CapturedVarsInfoMap);
+      if (Copyins) {
+        // Emit an implicit barrier to synchronize threads and avoid data
+        // races when propagating the master thread's values of threadprivate
+        // variables to the local instances in all other implicit threads.
+        OMPBuilder->CreateBarrier(Builder, OMPD_barrier, /*EmitChecks=*/false,
+                                  /*ForceSimpleCall=*/true);
+      }
+
+      EmitOMPPrivateClause(S, PrivateScope);
+      (void)PrivateScope.Privatize();
+
+      if (!OMP_Entry->getTerminator()) {
+        OMP_Entry->getInstList().push_back(EntryBI);
+      } else if (Builder.GetInsertBlock()->getTerminator()) {
+        EntryBI->dropAllReferences();
+        EntryBI->deleteValue();
+      } else {
+        Builder.Insert(EntryBI);
+      }
+
       OMPBuilderCBHelpers::EmitOMPRegionBody(*this, ParallelRegionBodyStmt,
                                              CodeGenIP, ContinuationBB);
+      llvm::Instruction *ContTI = ContinuationBB.getTerminator();
+      ContTI->removeFromParent();
+      Builder.SetInsertPoint(&ContinuationBB);
+      PrivateScope.ForceCleanup();
+      Builder.Insert(ContTI);
     };
 
     CGCapturedStmtInfo CGSI(*CS, CR_OpenMP);
     CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(*this, &CGSI);
+
     Builder.restoreIP(OMPBuilder->CreateParallel(Builder, BodyGenCB, PrivCB,
                                                  FiniCB, IfCond, NumThreads,
                                                  ProcBind, S.hasCancel()));
@@ -3261,13 +3730,10 @@
     if (const auto *HintClause = S.getSingleClause<OMPHintClause>())
       Hint = HintClause->getHint();
 
-    // TODO: This is slightly different from what's currently being done in
-    // clang. Fix the Int32Ty to IntPtrTy (pointer width size) when everything
-    // about typing is final.
     llvm::Value *HintInst = nullptr;
     if (Hint)
       HintInst =
-          Builder.CreateIntCast(EmitScalarExpr(Hint), CGM.Int32Ty, false);
+          Builder.CreateIntCast(EmitScalarExpr(Hint), CGM.IntPtrTy, false);
 
     auto FiniCB = [this](InsertPointTy IP) {
       OMPBuilderCBHelpers::FinalizeOMPRegion(*this, IP);
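
The final hunk resolves the earlier TODO by widening the 'critical' hint cast from Int32Ty to IntPtrTy, the pointer-width type referenced by the TODO that this hunk removes. A minimal source sketch (illustrative only, not from this patch's tests) that reaches this path:

int counter = 0;

void bump() {
#pragma omp parallel
  {
    // A hint requires a named critical; the literal 2 matches
    // omp_sync_hint_contended from <omp.h>, spelled out here to keep the
    // sketch header-free.
#pragma omp critical(c) hint(2)
    ++counter;
  }
}
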