sfantao created this revision.
sfantao added reviewers: ABataev, hfinkel, rjmccall.
sfantao added a subscriber: cfe-commits.

This patch implements the outlining of offloading functions for code annotated
with the OpenMP target directive. It uses a temporary naming scheme for the
outlined functions that will have to be updated later on, once target-side
codegen and registration of offloading libraries are implemented - the naming
needs to be made unique in the produced library.

Unlike other captured regions, target offloading cannot directly use the
Capture declaration, as each captured field has to be passed explicitly to the
runtime library and associated with potentially different mapping types
(to/from/alloc...). Therefore, some tweaking of the function prologue codegen
is required.

The current implementation still does not support capturing of global
variables. That requires a modification in Sema that I will propose in a
separate patch.

Thanks!
Samuel

http://reviews.llvm.org/D11361

Files:
  lib/CodeGen/CGOpenMPRuntime.cpp
  lib/CodeGen/CGOpenMPRuntime.h
  lib/CodeGen/CGStmt.cpp
  lib/CodeGen/CGStmtOpenMP.cpp
  lib/CodeGen/CodeGenFunction.cpp
  lib/CodeGen/CodeGenFunction.h
  test/OpenMP/target_codegen.cpp

Index: test/OpenMP/target_codegen.cpp
===================================================================
--- /dev/null
+++ test/OpenMP/target_codegen.cpp
@@ -0,0 +1,583 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// expected-no-diagnostics
+// REQUIRES: powerpc-registered-target
+#ifndef HEADER
+#define HEADER
+
+// CHECK-DAG: [[TT:%.+]] = type { i64, i8 }
+// CHECK-DAG: [[S1:%.+]] = type { double }
+
+// We have 8 target regions, but only 7 that actually will generate offloading
+// code, and only 6 will have mapped arguments.
+
+// CHECK-DAG: [[MAPT2:@.+]] = private constant [1 x i32] [i32 3]
+// CHECK-DAG: [[MAPT3:@.+]] = private constant [2 x i32] [i32 3, i32 3]
+// CHECK-DAG: [[MAPT4:@.+]] = private constant [9 x i32] [i32 3, i32 3, i32 1, i32 3, i32 3, i32 1, i32 1, i32 3, i32 3]
+// CHECK-DAG: [[MAPT5:@.+]] = private constant [3 x i32] [i32 3, i32 3, i32 3]
+// CHECK-DAG: [[MAPT6:@.+]] = private constant [4 x i32] [i32 3, i32 3, i32 3, i32 3]
+// CHECK-DAG: [[MAPT7:@.+]] = private constant [5 x i32] [i32 3, i32 3, i32 1, i32 1, i32 3]
+// CHECK-DAG: @{{.*}} = private constant i8 0
+// CHECK-DAG: @{{.*}} = private constant i8 0
+// CHECK-DAG: @{{.*}} = private constant i8 0
+// CHECK-DAG: @{{.*}} = private constant i8 0
+// CHECK-DAG: @{{.*}} = private constant i8 0
+// CHECK-DAG: @{{.*}} = private constant i8 0
+// CHECK-DAG: @{{.*}} = private constant i8 0
+
+template<typename tx, typename ty>
+struct TT{
+  tx X;
+  ty Y;
+};
+
+// CHECK: define {{.*}}[[FOO:@.+]](
+int foo(int n) {
+  int a = 0;
+  short aa = 0;
+  float b[10];
+  float bn[n];
+  double c[5][10];
+  double cn[5][n];
+  TT<long, char> d;
+
+  // CHECK:       br label %[[TRY:[^,]+]]
+  // CHECK:       [[TRY]]
+  // CHECK:       [[RET:%.+]] = call i32 @__tgt_target(i32 -1, i8* @{{[^,]+}}, i32 0, i8** null, i8** null, i64* null, i32* null)
+  // CHECK-NEXT:  [[ERROR:%.+]] = icmp ne i32 [[RET]], 0
+  // CHECK-NEXT:  br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]]
+  // CHECK:       [[FAIL]]
+  // CHECK:       call void [[HVT0:@.+]]()
+  // CHECK-NEXT:  br label %[[END]]
+  // CHECK:       [[END]]
+  #pragma omp target
+  {
+  }
+
+  // CHECK:       call void [[HVT1:@.+]](i32* {{[^,]+}})
+  #pragma omp target if(0)
+  {
+    a += 1;
+  }
+
+  // CHECK:       br label %[[TRY:[^,]+]]
+  // CHECK:       [[TRY]]
+  // CHECK-DAG:   [[RET:%.+]] = call i32 @__tgt_target(i32 -1, i8* @{{[^,]+}}, i32 1, i8** [[BP:%[^,]+]], i8** [[P:%[^,]+]], i64* [[S:%[^,]+]], i32* getelementptr inbounds ([1 x i32], [1 x i32]* [[MAPT2]], i32 0, i32 0))
+
+  // CHECK-DAG:   store i64 4, i64* [[SADDR0:%.+]]
+  // CHECK-DAG:   [[SADDR0]] = getelementptr inbounds i64, i64* [[S]], i32 [[IDX0:[0-9]+]]
+  // CHECK-DAG:   [[BPADDR0:%.+]] = getelementptr inbounds i8*, i8** [[BP]], i32 [[IDX0]]
+  // CHECK-DAG:   [[PADDR0:%.+]] = getelementptr inbounds i8*, i8** [[P]], i32 [[IDX0]]
+  // CHECK-DAG:   store i8* [[BP0:%[^,]+]], i8** [[BPADDR0]]
+  // CHECK-DAG:   store i8* [[P0:%[^,]+]], i8** [[PADDR0]]
+  // CHECK-DAG:   [[BP0]] = bitcast i32* %{{.+}} to i8*
+  // CHECK-DAG:   [[P0]] = bitcast i32* %{{.+}} to i8*
+
+  // CHECK:       [[ERROR:%.+]] = icmp ne i32 [[RET]], 0
+  // CHECK-NEXT:  br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]]
+  // CHECK:       [[FAIL]]
+  // CHECK:       call void [[HVT2:@.+]](i32* {{[^,]+}})
+  // CHECK-NEXT:  br label %[[END]]
+  // CHECK:       [[END]]
+  #pragma omp target if(1)
+  {
+    a += 1;
+  }
+
+  // CHECK:       [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 10
+  // CHECK:       br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]]
+  // CHECK:       [[TRY]]
+  // CHECK-DAG:   [[RET:%.+]] = call i32 @__tgt_target(i32 -1, i8* @{{[^,]+}}, i32 2, i8** [[BP:%[^,]+]], i8** [[P:%[^,]+]], i64* [[S:%[^,]+]], i32* getelementptr inbounds ([2 x i32], [2 x i32]* [[MAPT3]], i32 0, i32 0))
+
+  // CHECK-DAG:   store i64 4, i64* [[SADDR0:%.+]]
+  // CHECK-DAG:   [[SADDR0]] = getelementptr inbounds i64, i64* [[S]], i32 [[IDX0:[0-9]+]]
+  // CHECK-DAG:   [[BPADDR0:%.+]] = getelementptr inbounds i8*, i8** [[BP]], i32 [[IDX0]]
+  // CHECK-DAG:   [[PADDR0:%.+]] = getelementptr inbounds i8*, i8** [[P]], i32 [[IDX0]]
+  // CHECK-DAG:   store i8* [[BP0:%[^,]+]], i8** [[BPADDR0]]
+  // CHECK-DAG:   store i8* [[P0:%[^,]+]], i8** [[PADDR0]]
+  // CHECK-DAG:   [[BP0]] = bitcast i32* %{{.+}} to i8*
+  // CHECK-DAG:   [[P0]] = bitcast i32* %{{.+}} to i8*
+
+  // CHECK-DAG:   store i64 2, i64* [[SADDR1:%.+]]
+  // CHECK-DAG:   [[SADDR1]] = getelementptr inbounds i64, i64* [[S]], i32 [[IDX1:[0-9]+]]
+  // CHECK-DAG:   [[BPADDR1:%.+]] = getelementptr inbounds i8*, i8** [[BP]], i32 [[IDX1]]
+  // CHECK-DAG:   [[PADDR1:%.+]] = getelementptr inbounds i8*, i8** [[P]], i32 [[IDX1]]
+  // CHECK-DAG:   store i8* [[BP1:%[^,]+]], i8** [[BPADDR1]]
+  // CHECK-DAG:   store i8* [[P1:%[^,]+]], i8** [[PADDR1]]
+  // CHECK-DAG:   [[BP1]] = bitcast i16* %{{.+}} to i8*
+  // CHECK-DAG:   [[P1]] = bitcast i16* %{{.+}} to i8*
+
+  // CHECK:       [[ERROR:%.+]] = icmp ne i32 [[RET]], 0
+  // CHECK-NEXT:  br i1 [[ERROR]], label %[[FAIL]], label %[[END:[^,]+]]
+  // CHECK:       [[FAIL]]
+  // CHECK:       call void [[HVT3:@.+]]({{[^,]+}}, {{[^,]+}})
+  // CHECK-NEXT:  br label %[[END]]
+  // CHECK:       [[END]]
+  #pragma omp target if(n>10)
+  {
+    a += 1;
+    aa += 1;
+  }
+
+  // We capture 3 VLA sizes in this target region
+  // CHECK-DAG:   store i64 %{{[^,]+}}, i64* [[VLA0:%[^,]+]]
+  // CHECK-DAG:   store i64 %{{[^,]+}}, i64* [[VLA1:%[^,]+]]
+  // CHECK-DAG:   store i64 %{{[^,]+}}, i64* [[VLA2:%[^,]+]]
+  // CHECK:       [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 20
+  // CHECK:       br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]]
+  // CHECK:       [[TRY]]
+  // CHECK-DAG:   [[RET:%.+]] = call i32 @__tgt_target(i32 -1, i8* @{{[^,]+}}, i32 9, i8** [[BP:%[^,]+]], i8** [[P:%[^,]+]], i64* [[S:%[^,]+]], i32* getelementptr inbounds ([9 x i32], [9 x i32]* [[MAPT4]], i32 0, i32 0))
+
+  // CHECK-DAG:   [[SADDR0:%.+]] = getelementptr inbounds i64, i64* [[S]], i32 [[IDX0:[0-9]+]]
+  // CHECK-DAG:   [[BPADDR0:%.+]] = getelementptr inbounds i8*, i8** [[BP]], i32 [[IDX0]]
+  // CHECK-DAG:   [[PADDR0:%.+]] = getelementptr inbounds i8*, i8** [[P]], i32 [[IDX0]]
+  // CHECK-DAG:   [[SADDR1:%.+]] = getelementptr inbounds i64, i64* [[S]], i32 [[IDX1:[0-9]+]]
+  // CHECK-DAG:   [[BPADDR1:%.+]] = getelementptr inbounds i8*, i8** [[BP]], i32 [[IDX1]]
+  // CHECK-DAG:   [[PADDR1:%.+]] = getelementptr inbounds i8*, i8** [[P]], i32 [[IDX1]]
+  // CHECK-DAG:   [[SADDR2:%.+]] = getelementptr inbounds i64, i64* [[S]], i32 [[IDX2:[0-9]+]]
+  // CHECK-DAG:   [[BPADDR2:%.+]] = getelementptr inbounds i8*, i8** [[BP]], i32 [[IDX2]]
+  // CHECK-DAG:   [[PADDR2:%.+]] = getelementptr inbounds i8*, i8** [[P]], i32 [[IDX2]]
+  // CHECK-DAG:   [[SADDR3:%.+]] = getelementptr inbounds i64, i64* [[S]], i32 [[IDX3:[0-9]+]]
+  // CHECK-DAG:   [[BPADDR3:%.+]] = getelementptr inbounds i8*, i8** [[BP]], i32 [[IDX3]]
+  // CHECK-DAG:   [[PADDR3:%.+]] = getelementptr inbounds i8*, i8** [[P]], i32 [[IDX3]]
+  // CHECK-DAG:   [[SADDR4:%.+]] = getelementptr inbounds i64, i64* [[S]], i32 [[IDX4:[0-9]+]]
+  // CHECK-DAG:   [[BPADDR4:%.+]] = getelementptr inbounds i8*, i8** [[BP]], i32 [[IDX4]]
+  // CHECK-DAG:   [[PADDR4:%.+]] = getelementptr inbounds i8*, i8** [[P]], i32 [[IDX4]]
+  // CHECK-DAG:   [[SADDR5:%.+]] = getelementptr inbounds i64, i64* [[S]], i32 [[IDX5:[0-9]+]]
+  // CHECK-DAG:   [[BPADDR5:%.+]] = getelementptr inbounds i8*, i8** [[BP]], i32 [[IDX5]]
+  // CHECK-DAG:   [[PADDR5:%.+]] = getelementptr inbounds i8*, i8** [[P]], i32 [[IDX5]]
+  // CHECK-DAG:   [[SADDR6:%.+]] = getelementptr inbounds i64, i64* [[S]], i32 [[IDX6:[0-9]+]]
+  // CHECK-DAG:   [[BPADDR6:%.+]] = getelementptr inbounds i8*, i8** [[BP]], i32 [[IDX6]]
+  // CHECK-DAG:   [[PADDR6:%.+]] = getelementptr inbounds i8*, i8** [[P]], i32 [[IDX6]]
+  // CHECK-DAG:   [[SADDR7:%.+]] = getelementptr inbounds i64, i64* [[S]], i32 [[IDX7:[0-9]+]]
+  // CHECK-DAG:   [[BPADDR7:%.+]] = getelementptr inbounds i8*, i8** [[BP]], i32 [[IDX7]]
+  // CHECK-DAG:   [[PADDR7:%.+]] = getelementptr inbounds i8*, i8** [[P]], i32 [[IDX7]]
+  // CHECK-DAG:   [[SADDR8:%.+]] = getelementptr inbounds i64, i64* [[S]], i32 [[IDX8:[0-9]+]]
+  // CHECK-DAG:   [[BPADDR8:%.+]] = getelementptr inbounds i8*, i8** [[BP]], i32 [[IDX8]]
+  // CHECK-DAG:   [[PADDR8:%.+]] = getelementptr inbounds i8*, i8** [[P]], i32 [[IDX8]]
+
+  // The names below are not necessarily consistent with the names used for the
+  // addresses above as some are repeated.
+  // CHECK-DAG:   [[BP0:%[^,]+]] = bitcast i64* [[VLA0]] to i8*
+  // CHECK-DAG:   [[P0:%[^,]+]] = bitcast i64* [[VLA0]] to i8*
+  // CHECK-DAG:   store i8* [[BP0]], i8** {{%[^,]+}}
+  // CHECK-DAG:   store i8* [[P0]], i8** {{%[^,]+}}
+  // CHECK-DAG:   store i64 8, i64* {{%[^,]+}}
+
+  // CHECK-DAG:   [[BP1:%[^,]+]] = bitcast i64* [[VLA1]] to i8*
+  // CHECK-DAG:   [[P1:%[^,]+]] = bitcast i64* [[VLA1]] to i8*
+  // CHECK-DAG:   store i8* [[BP1]], i8** {{%[^,]+}}
+  // CHECK-DAG:   store i8* [[P1]], i8** {{%[^,]+}}
+  // CHECK-DAG:   store i64 8, i64* {{%[^,]+}}
+
+  // CHECK-DAG:   [[BP2:%[^,]+]] = bitcast i64* [[VLA2]] to i8*
+  // CHECK-DAG:   [[P2:%[^,]+]] = bitcast i64* [[VLA2]] to i8*
+  // CHECK-DAG:   store i8* [[BP2]], i8** {{%[^,]+}}
+  // CHECK-DAG:   store i8* [[P2]], i8** {{%[^,]+}}
+  // CHECK-DAG:   store i64 8, i64* {{%[^,]+}}
+
+  // CHECK-DAG:   [[BP3:%[^,]+]] = bitcast i32* %{{.+}} to i8*
+  // CHECK-DAG:   [[P3:%[^,]+]] = bitcast i32* %{{.+}} to i8*
+  // CHECK-DAG:   store i8* [[BP3]], i8** {{%[^,]+}}
+  // CHECK-DAG:   store i8* [[P3]], i8** {{%[^,]+}}
+  // CHECK-DAG:   store i64 4, i64* {{%[^,]+}}
+
+  // CHECK-DAG:   [[BP4:%[^,]+]] = bitcast [10 x float]* %{{.+}} to i8*
+  // CHECK-DAG:   [[P4:%[^,]+]] = bitcast [10 x float]* %{{.+}} to i8*
+  // CHECK-DAG:   store i8* [[BP4]], i8** {{%[^,]+}}
+  // CHECK-DAG:   store i8* [[P4]], i8** {{%[^,]+}}
+  // CHECK-DAG:   store i64 40, i64* {{%[^,]+}}
+
+  // CHECK-DAG:   [[BP5:%[^,]+]] = bitcast float* %{{.+}} to i8*
+  // CHECK-DAG:   [[P5:%[^,]+]] = bitcast float* %{{.+}} to i8*
+  // CHECK-DAG:   store i8* [[BP5]], i8** {{%[^,]+}}
+  // CHECK-DAG:   store i8* [[P5]], i8** {{%[^,]+}}
+  // CHECK-DAG:   store i64 4, i64* {{%[^,]+}}
+
+  // CHECK-DAG:   [[BP6:%[^,]+]] = bitcast [5 x [10 x double]]* %{{.+}} to i8*
+  // CHECK-DAG:   [[P6:%[^,]+]] = bitcast [5 x [10 x double]]* %{{.+}} to i8*
+  // CHECK-DAG:   store i8* [[BP6]], i8** {{%[^,]+}}
+  // CHECK-DAG:   store i8* [[P6]], i8** {{%[^,]+}}
+  // CHECK-DAG:   store i64 400, i64* {{%[^,]+}}
+
+  // CHECK-DAG:   [[BP7:%[^,]+]] = bitcast double* %{{.+}} to i8*
+  // CHECK-DAG:   [[P7:%[^,]+]] = bitcast double* %{{.+}} to i8*
+  // CHECK-DAG:   store i8* [[BP7]], i8** {{%[^,]+}}
+  // CHECK-DAG:   store i8* [[P7]], i8** {{%[^,]+}}
+  // CHECK-DAG:   store i64 8, i64* {{%[^,]+}}
+
+  // CHECK-DAG:   [[BP8:%[^,]+]] = bitcast [[TT]]* %{{.+}} to i8*
+  // CHECK-DAG:   [[P8:%[^,]+]] = bitcast [[TT]]* %{{.+}} to i8*
+  // CHECK-DAG:   store i8* [[BP8]], i8** {{%[^,]+}}
+  // CHECK-DAG:   store i8* [[P8]], i8** {{%[^,]+}}
+  // CHECK-DAG:   store i64 16, i64* {{%[^,]+}}
+
+  // CHECK:       [[ERROR:%.+]] = icmp ne i32 [[RET]], 0
+  // CHECK-NEXT:  br i1 [[ERROR]], label %[[FAIL]], label %[[END:[^,]+]]
+  // CHECK:       [[FAIL]]
+  // CHECK:       call void [[HVT4:@.+]]({{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}})
+  // CHECK-NEXT:  br label %[[END]]
+  // CHECK:       [[END]]
+  #pragma omp target if(n>20)
+  {
+    a += 1;
+    b[2] += 1.0;
+    bn[3] += 1.0;
+    c[1][2] += 1.0;
+    cn[1][3] += 1.0;
+    d.X += 1;
+    d.Y += 1;
+  }
+
+  return a;
+}
+
+// Check that the offloading functions are emitted and that the arguments are
+// correct and loaded correctly for the target regions in foo().
+
+// CHECK:       define internal void [[HVT0]]
+
+// CHECK:       define internal void [[HVT1]]
+// CHECK-DAG:   [[LOCALX_A:%.+]] = alloca i32*
+// CHECK-DAG:   store i32* [[ARG_A:%.+]], i32** [[LOCALX_A]]
+// CHECK-DAG:   [[USE_A:%.+]] = load i32*, i32** [[LOCALX_A:%.+]]
+// CHECK-DAG:   load i32, i32* [[USE_A]]
+
+// CHECK:       define internal void [[HVT2]]
+// CHECK-DAG:   [[LOCALX_A:%.+]] = alloca i32*
+// CHECK-DAG:   store i32* [[ARG_A:%.+]], i32** [[LOCALX_A]]
+// CHECK-DAG:   [[USE_A:%.+]] = load i32*, i32** [[LOCALX_A:%.+]]
+// CHECK-DAG:   load i32, i32* [[USE_A]]
+
+// CHECK:       define internal void [[HVT3]]
+// CHECK-DAG:   [[LOCALX_A:%.+]] = alloca i32*
+// CHECK-DAG:   [[LOCALX_AA:%.+]] = alloca i16*
+// CHECK-DAG:   store i32* [[ARG_A:%.+]], i32** [[LOCALX_A]]
+// CHECK-DAG:   store i16* [[ARG_AA:%.+]], i16** [[LOCALX_AA]]
+// CHECK-DAG:   [[USE_A:%.+]] = load i32*, i32** [[LOCALX_A:%.+]]
+// CHECK-DAG:   [[USE_AA:%.+]] = load i16*, i16** [[LOCALX_AA:%.+]]
+// CHECK-DAG:   load i32, i32* [[USE_A]]
+// CHECK-DAG:   load i16, i16* [[USE_AA]]
+
+// CHECK:       define internal void [[HVT4]]
+// CHECK-DAG:   [[LOCALX_A:%.+]] = alloca i32*
+// CHECK-DAG:   [[LOCALX_B:%.+]] = alloca [10 x float]*
+// CHECK-DAG:   [[LOCALX_BN:%.+]] = alloca float*
+// CHECK-DAG:   [[LOCALX_C:%.+]] = alloca [5 x [10 x double]]*
+// CHECK-DAG:   [[LOCALX_CN:%.+]] = alloca double*
+// CHECK-DAG:   [[LOCALX_D:%.+]] = alloca [[TT]]*
+// CHECK-DAG:   [[LOCALX_VLA1:%.+]] = alloca i64*
+// CHECK-DAG:   [[LOCALX_VLA2:%.+]] = alloca i64*
+// CHECK-DAG:   [[LOCALX_VLA3:%.+]] = alloca i64*
+// CHECK-DAG:   store i32* [[ARG_A:%.+]], i32** [[LOCALX_A]]
+// CHECK-DAG:   store [10 x float]* [[ARG_B:%.+]], [10 x float]** [[LOCALX_B]]
+// CHECK-DAG:   store float* [[ARG_BN:%.+]], float** [[LOCALX_BN]]
+// CHECK-DAG:   store [5 x [10 x double]]* [[ARG_C:%.+]], [5 x [10 x double]]** [[LOCALX_C]]
+// CHECK-DAG:   store double* [[ARG_CN:%.+]], double** [[LOCALX_CN]]
+// CHECK-DAG:   store [[TT]]* [[ARG_D:%.+]], [[TT]]** [[LOCALX_D]]
+// CHECK-DAG:   store i64* [[ARG_VLA1:%.+]], i64** [[LOCALX_VLA1]]
+// CHECK-DAG:   store i64* [[ARG_VLA2:%.+]], i64** [[LOCALX_VLA2]]
+// CHECK-DAG:   store i64* [[ARG_VLA3:%.+]], i64** [[LOCALX_VLA3]]
+// CHECK-DAG:   [[USE_A:%.+]] = load i32*, i32** [[LOCALX_A:%.+]]
+// CHECK-DAG:   [[USE_B:%.+]] = load [10 x float]*, [10 x float]** [[LOCALX_B:%.+]]
+// CHECK-DAG:   [[USE_BN:%.+]] = load float*, float** [[LOCALX_BN:%.+]]
+// CHECK-DAG:   [[USE_C:%.+]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[LOCALX_C:%.+]]
+// CHECK-DAG:   [[USE_CN:%.+]] = load double*, double** [[LOCALX_CN:%.+]]
+// CHECK-DAG:   [[USE_D:%.+]] = load [[TT]]*, [[TT]]** [[LOCALX_D:%.+]]
+// CHECK-DAG:   [[USE_VLA1:%.+]] = load i64*, i64** [[LOCALX_VLA1:%.+]]
+// CHECK-DAG:   [[USE_VLA2:%.+]] = load i64*, i64** [[LOCALX_VLA2:%.+]]
+// CHECK-DAG:   [[USE_VLA3:%.+]] = load i64*, i64** [[LOCALX_VLA3:%.+]]
+// CHECK-DAG:   load i32, i32* [[USE_A]]
+// CHECK-DAG:   getelementptr inbounds [10 x float], [10 x float]* [[USE_B]], i{{.*}} 0, i{{.*}} 2
+// CHECK-DAG:   getelementptr inbounds float, float* [[USE_BN]], i{{.*}} 3
+// CHECK-DAG:   getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[USE_C]], i{{.*}} 0, i{{.*}} 1
+// CHECK-DAG:   [[VLAMUL:%.+]] = mul {{.*}}i64 1, %{{.*}}
+// CHECK-DAG:   getelementptr inbounds double, double* [[USE_CN]], i{{.*}} [[VLAMUL]]
+// CHECK-DAG:   load i64, i64* [[USE_VLA1]]
+// CHECK-DAG:   load i64, i64* [[USE_VLA2]]
+// CHECK-DAG:   load i64, i64* [[USE_VLA3]]
+
+template<typename tx>
+tx ftemplate(int n) {
+  tx a = 0;
+  short aa = 0;
+  tx b[10];
+
+  #pragma omp target if(n>40)
+  {
+    a += 1;
+    aa += 1;
+    b[2] += 1;
+  }
+
+  return a;
+}
+
+static
+int fstatic(int n) {
+  int a = 0;
+  short aa = 0;
+  char aaa = 0;
+  int b[10];
+
+  #pragma omp target if(n>50)
+  {
+    a += 1;
+    aa += 1;
+    aaa += 1;
+    b[2] += 1;
+  }
+
+  return a;
+}
+
+struct S1 {
+  double a;
+
+  int r1(int n){
+    int b = n+1;
+    short int c[2][n];
+
+    #pragma omp target if(n>60)
+    {
+      this->a = (double)b + 1.5;
+      c[1][1] = ++a;
+    }
+
+    return c[1][1] + (int)b;
+  }
+};
+
+// CHECK: define {{.*}}@{{.*}}bar{{.*}}
+int bar(int n){
+  int a = 0;
+
+  // CHECK: call {{.*}}i32 [[FOO]](i32 {{.*}})
+  a += foo(n);
+
+  S1 S;
+  // CHECK: call {{.*}}i32 [[FS1:@.+]]([[S1]]* {{.*}}, i32 {{.*}})
+  a += S.r1(n);
+
+  // CHECK: call {{.*}}i32 [[FSTATIC:@.+]](i32 {{.*}})
+  a += fstatic(n);
+
+  // CHECK: call {{.*}}i32 [[FTEMPLATE:@.+]](i32 {{.*}})
+  a += ftemplate<int>(n);
+
+  return a;
+}
+
+//
+// CHECK: define {{.*}}[[FS1]]
+//
+// We capture 2 VLA sizes in this target region
+// CHECK-DAG:   store i64 %{{[^,]+}}, i64* [[VLA0:%[^,]+]]
+// CHECK-DAG:   store i64 %{{[^,]+}}, i64* [[VLA1:%[^,]+]]
+// CHECK:       [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 60
+// CHECK:       br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]]
+// CHECK:       [[TRY]]
+// CHECK-DAG:   [[RET:%.+]] = call i32 @__tgt_target(i32 -1, i8* @{{[^,]+}}, i32 5, i8** [[BP:%[^,]+]], i8** [[P:%[^,]+]], i64* [[S:%[^,]+]], i32* getelementptr inbounds ([5 x i32], [5 x i32]* [[MAPT7]], i32 0, i32 0))
+
+// CHECK-DAG:   [[SADDR0:%.+]] = getelementptr inbounds i64, i64* [[S]], i32 [[IDX0:[0-9]+]]
+// CHECK-DAG:   [[BPADDR0:%.+]] = getelementptr inbounds i8*, i8** [[BP]], i32 [[IDX0]]
+// CHECK-DAG:   [[PADDR0:%.+]] = getelementptr inbounds i8*, i8** [[P]], i32 [[IDX0]]
+// CHECK-DAG:   [[SADDR1:%.+]] = getelementptr inbounds i64, i64* [[S]], i32 [[IDX1:[0-9]+]]
+// CHECK-DAG:   [[BPADDR1:%.+]] = getelementptr inbounds i8*, i8** [[BP]], i32 [[IDX1]]
+// CHECK-DAG:   [[PADDR1:%.+]] = getelementptr inbounds i8*, i8** [[P]], i32 [[IDX1]]
+// CHECK-DAG:   [[SADDR2:%.+]] = getelementptr inbounds i64, i64* [[S]], i32 [[IDX2:[0-9]+]]
+// CHECK-DAG:   [[BPADDR2:%.+]] = getelementptr inbounds i8*, i8** [[BP]], i32 [[IDX2]]
+// CHECK-DAG:   [[PADDR2:%.+]] = getelementptr inbounds i8*, i8** [[P]], i32 [[IDX2]]
+// CHECK-DAG:   [[SADDR3:%.+]] = getelementptr inbounds i64, i64* [[S]], i32 [[IDX3:[0-9]+]]
+// CHECK-DAG:   [[BPADDR3:%.+]] = getelementptr inbounds i8*, i8** [[BP]], i32 [[IDX3]]
+// CHECK-DAG:   [[PADDR3:%.+]] = getelementptr inbounds i8*, i8** [[P]], i32 [[IDX3]]
+
+// The names below are not necessarily consistent with the names used for the
+// addresses above as some are repeated.
+// CHECK-DAG:   [[BP0:%[^,]+]] = bitcast i64* [[VLA0]] to i8*
+// CHECK-DAG:   [[P0:%[^,]+]] = bitcast i64* [[VLA0]] to i8*
+// CHECK-DAG:   store i8* [[BP0]], i8** {{%[^,]+}}
+// CHECK-DAG:   store i8* [[P0]], i8** {{%[^,]+}}
+// CHECK-DAG:   store i64 8, i64* {{%[^,]+}}
+
+// CHECK-DAG:   [[BP1:%[^,]+]] = bitcast i64* [[VLA1]] to i8*
+// CHECK-DAG:   [[P1:%[^,]+]] = bitcast i64* [[VLA1]] to i8*
+// CHECK-DAG:   store i8* [[BP1]], i8** {{%[^,]+}}
+// CHECK-DAG:   store i8* [[P1]], i8** {{%[^,]+}}
+// CHECK-DAG:   store i64 8, i64* {{%[^,]+}}
+
+// CHECK-DAG:   [[BP2:%[^,]+]] = bitcast i32* %{{.+}} to i8*
+// CHECK-DAG:   [[P2:%[^,]+]] = bitcast i32* %{{.+}} to i8*
+// CHECK-DAG:   store i8* [[BP2]], i8** {{%[^,]+}}
+// CHECK-DAG:   store i8* [[P2]], i8** {{%[^,]+}}
+// CHECK-DAG:   store i64 4, i64* {{%[^,]+}}
+
+// CHECK-DAG:   [[BP3:%[^,]+]] = bitcast [[S1]]* %{{.+}} to i8*
+// CHECK-DAG:   [[P3:%[^,]+]] = bitcast [[S1]]* %{{.+}} to i8*
+// CHECK-DAG:   store i8* [[BP3]], i8** {{%[^,]+}}
+// CHECK-DAG:   store i8* [[P3]], i8** {{%[^,]+}}
+// CHECK-DAG:   store i64 8, i64* {{%[^,]+}}
+
+// CHECK-DAG:   [[BP4:%[^,]+]] = bitcast i16* %{{.+}} to i8*
+// CHECK-DAG:   [[P4:%[^,]+]] = bitcast i16* %{{.+}} to i8*
+// CHECK-DAG:   store i8* [[BP4]], i8** {{%[^,]+}}
+// CHECK-DAG:   store i8* [[P4]], i8** {{%[^,]+}}
+// CHECK-DAG:   store i64 2, i64* {{%[^,]+}}
+
+// CHECK:       [[ERROR:%.+]] = icmp ne i32 [[RET]], 0
+// CHECK-NEXT:  br i1 [[ERROR]], label %[[FAIL]], label %[[END:[^,]+]]
+// CHECK:       [[FAIL]]
+// CHECK:       call void [[HVT7:@.+]]({{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}})
+// CHECK-NEXT:  br label %[[END]]
+// CHECK:       [[END]]
+
+//
+// CHECK: define {{.*}}[[FSTATIC]]
+//
+// CHECK:       [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 50
+// CHECK:       br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]]
+// CHECK:       [[TRY]]
+// CHECK-DAG:   [[RET:%.+]] = call i32 @__tgt_target(i32 -1, i8* @{{[^,]+}}, i32 4, i8** [[BP:%[^,]+]], i8** [[P:%[^,]+]], i64* [[S:%[^,]+]], i32* getelementptr inbounds ([4 x i32], [4 x i32]* [[MAPT6]], i32 0, i32 0))
+
+// CHECK-DAG:   store i64 4, i64* [[SADDR0:%.+]]
+// CHECK-DAG:   [[SADDR0]] = getelementptr inbounds i64, i64* [[S]], i32 [[IDX0:[0-9]+]]
+// CHECK-DAG:   [[BPADDR0:%.+]] = getelementptr inbounds i8*, i8** [[BP]], i32 [[IDX0]]
+// CHECK-DAG:   [[PADDR0:%.+]] = getelementptr inbounds i8*, i8** [[P]], i32 [[IDX0]]
+// CHECK-DAG:   store i8* [[BP0:%[^,]+]], i8** [[BPADDR0]]
+// CHECK-DAG:   store i8* [[P0:%[^,]+]], i8** [[PADDR0]]
+// CHECK-DAG:   [[BP0]] = bitcast i32* %{{.+}} to i8*
+// CHECK-DAG:   [[P0]] = bitcast i32* %{{.+}} to i8*
+
+// CHECK-DAG:   store i64 2, i64* [[SADDR1:%.+]]
+// CHECK-DAG:   [[SADDR1]] = getelementptr inbounds i64, i64* [[S]], i32 [[IDX1:[0-9]+]]
+// CHECK-DAG:   [[BPADDR1:%.+]] = getelementptr inbounds i8*, i8** [[BP]], i32 [[IDX1]]
+// CHECK-DAG:   [[PADDR1:%.+]] = getelementptr inbounds i8*, i8** [[P]], i32 [[IDX1]]
+// CHECK-DAG:   store i8* [[BP1:%[^,]+]], i8** [[BPADDR1]]
+// CHECK-DAG:   store i8* [[P1:%[^,]+]], i8** [[PADDR1]]
+// CHECK-DAG:   [[BP1]] = bitcast i16* %{{.+}} to i8*
+// CHECK-DAG:   [[P1]] = bitcast i16* %{{.+}} to i8*
+
+// CHECK-DAG:   store i64 1, i64* [[SADDR2:%.+]]
+// CHECK-DAG:   [[SADDR2]] = getelementptr inbounds i64, i64* [[S]], i32 [[IDX2:[0-9]+]]
+// CHECK-DAG:   [[BPADDR2:%.+]] = getelementptr inbounds i8*, i8** [[BP]], i32 [[IDX2]]
+// CHECK-DAG:   [[PADDR2:%.+]] = getelementptr inbounds i8*, i8** [[P]], i32 [[IDX2]]
+// CHECK-DAG:   store i8* [[BP2:%[^,]+]], i8** [[BPADDR2]]
+// CHECK-DAG:   store i8* [[P2:%[^,]+]], i8** [[PADDR2]]
+
+// CHECK-DAG:   store i64 40, i64* [[SADDR3:%.+]]
+// CHECK-DAG:   [[SADDR3]] = getelementptr inbounds i64, i64* [[S]], i32 [[IDX3:[0-9]+]]
+// CHECK-DAG:   [[BPADDR3:%.+]] = getelementptr inbounds i8*, i8** [[BP]], i32 [[IDX3]]
+// CHECK-DAG:   [[PADDR3:%.+]] = getelementptr inbounds i8*, i8** [[P]], i32 [[IDX3]]
+// CHECK-DAG:   store i8* [[BP3:%[^,]+]], i8** [[BPADDR3]]
+// CHECK-DAG:   store i8* [[P3:%[^,]+]], i8** [[PADDR3]]
+// CHECK-DAG:   [[BP3]] = bitcast [10 x i32]* %{{.+}} to i8*
+// CHECK-DAG:   [[P3]] = bitcast [10 x i32]* %{{.+}} to i8*
+
+// CHECK:       [[ERROR:%.+]] = icmp ne i32 [[RET]], 0
+// CHECK-NEXT:  br i1 [[ERROR]], label %[[FAIL]], label %[[END:[^,]+]]
+// CHECK:       [[FAIL]]
+// CHECK:       call void [[HVT6:@.+]]({{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}})
+// CHECK-NEXT:  br label %[[END]]
+// CHECK:       [[END]]
+
+//
+// CHECK: define {{.*}}[[FTEMPLATE]]
+//
+// CHECK:       [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 40
+// CHECK:       br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]]
+// CHECK:       [[TRY]]
+// CHECK-DAG:   [[RET:%.+]] = call i32 @__tgt_target(i32 -1, i8* @{{[^,]+}}, i32 3, i8** [[BP:%[^,]+]], i8** [[P:%[^,]+]], i64* [[S:%[^,]+]], i32* getelementptr inbounds ([3 x i32], [3 x i32]* [[MAPT5]], i32 0, i32 0))
+
+// CHECK-DAG:   store i64 4, i64* [[SADDR0:%.+]]
+// CHECK-DAG:   [[SADDR0]] = getelementptr inbounds i64, i64* [[S]], i32 [[IDX0:[0-9]+]]
+// CHECK-DAG:   [[BPADDR0:%.+]] = getelementptr inbounds i8*, i8** [[BP]], i32 [[IDX0]]
+// CHECK-DAG:   [[PADDR0:%.+]] = getelementptr inbounds i8*, i8** [[P]], i32 [[IDX0]]
+// CHECK-DAG:   store i8* [[BP0:%[^,]+]], i8** [[BPADDR0]]
+// CHECK-DAG:   store i8* [[P0:%[^,]+]], i8** [[PADDR0]]
+// CHECK-DAG:   [[BP0]] = bitcast i32* %{{.+}} to i8*
+// CHECK-DAG:   [[P0]] = bitcast i32* %{{.+}} to i8*
+
+// CHECK-DAG:   store i64 2, i64* [[SADDR1:%.+]]
+// CHECK-DAG:   [[SADDR1]] = getelementptr inbounds i64, i64* [[S]], i32 [[IDX1:[0-9]+]]
+// CHECK-DAG:   [[BPADDR1:%.+]] = getelementptr inbounds i8*, i8** [[BP]], i32 [[IDX1]]
+// CHECK-DAG:   [[PADDR1:%.+]] = getelementptr inbounds i8*, i8** [[P]], i32 [[IDX1]]
+// CHECK-DAG:   store i8* [[BP1:%[^,]+]], i8** [[BPADDR1]]
+// CHECK-DAG:   store i8* [[P1:%[^,]+]], i8** [[PADDR1]]
+// CHECK-DAG:   [[BP1]] = bitcast i16* %{{.+}} to i8*
+// CHECK-DAG:   [[P1]] = bitcast i16* %{{.+}} to i8*
+
+// CHECK-DAG:   store i64 40, i64* [[SADDR2:%.+]]
+// CHECK-DAG:   [[SADDR2]] = getelementptr inbounds i64, i64* [[S]], i32 [[IDX2:[0-9]+]]
+// CHECK-DAG:   [[BPADDR2:%.+]] = getelementptr inbounds i8*, i8** [[BP]], i32 [[IDX2]]
+// CHECK-DAG:   [[PADDR2:%.+]] = getelementptr inbounds i8*, i8** [[P]], i32 [[IDX2]]
+// CHECK-DAG:   store i8* [[BP2:%[^,]+]], i8** [[BPADDR2]]
+// CHECK-DAG:   store i8* [[P2:%[^,]+]], i8** [[PADDR2]]
+// CHECK-DAG:   [[BP2]] = bitcast [10 x i32]* %{{.+}} to i8*
+// CHECK-DAG:   [[P2]] = bitcast [10 x i32]* %{{.+}} to i8*
+
+// CHECK:       [[ERROR:%.+]] = icmp ne i32 [[RET]], 0
+// CHECK-NEXT:  br i1 [[ERROR]], label %[[FAIL]], label %[[END:[^,]+]]
+// CHECK:       [[FAIL]]
+// CHECK:       call void [[HVT5:@.+]]({{[^,]+}}, {{[^,]+}}, {{[^,]+}})
+// CHECK-NEXT:  br label %[[END]]
+// CHECK:       [[END]]
+
+// Check that the offloading functions are emitted and that the arguments are
+// correct and loaded correctly for the target regions of the callees of bar().
+
+// CHECK:       define internal void [[HVT7]]
+// CHECK-DAG:   [[LOCALX_THIS:%.+]] = alloca [[S1]]*
+// CHECK-DAG:   [[LOCALX_B:%.+]] = alloca i32*
+// CHECK-DAG:   [[LOCALX_C:%.+]] = alloca i16*
+// CHECK-DAG:   [[LOCALX_VLA1:%.+]] = alloca i64*
+// CHECK-DAG:   [[LOCALX_VLA2:%.+]] = alloca i64*
+// CHECK-DAG:   store [[S1]]* [[ARG_THIS:%.+]], [[S1]]** [[LOCALX_THIS]]
+// CHECK-DAG:   store i32* [[ARG_B:%.+]], i32** [[LOCALX_B]]
+// CHECK-DAG:   store i16* [[ARG_C:%.+]], i16** [[LOCALX_C]]
+// CHECK-DAG:   store i64* [[ARG_VLA1:%.+]], i64** [[LOCALX_VLA1]]
+// CHECK-DAG:   store i64* [[ARG_VLA2:%.+]], i64** [[LOCALX_VLA2]]
+// CHECK-DAG:   [[USE_THIS:%.+]] = load [[S1]]*, [[S1]]** [[LOCALX_THIS]]
+// CHECK-DAG:   [[USE_B:%.+]] = load i32*, i32** [[LOCALX_B]]
+// CHECK-DAG:   [[USE_C:%.+]] = load i16*, i16** [[LOCALX_C]]
+// CHECK-DAG:   [[USE_VLA1:%.+]] = load i64*, i64** [[LOCALX_VLA1]]
+// CHECK-DAG:   [[USE_VLA2:%.+]] = load i64*, i64** [[LOCALX_VLA2]]
+// CHECK-DAG:   getelementptr inbounds [[S1]], [[S1]]* [[USE_THIS]], i{{.*}} 0, i{{.*}} 0
+// CHECK-DAG:   load i32, i32* [[USE_B]]
+// CHECK-DAG:   [[VLAMUL:%.+]] = mul {{.*}}i64 1, %{{.*}}
+// CHECK-DAG:   getelementptr inbounds i16, i16* [[USE_C]], i{{.*}} [[VLAMUL]]
+// CHECK-DAG:   load i64, i64* [[USE_VLA1]]
+// CHECK-DAG:   load i64, i64* [[USE_VLA2]]
+
+// CHECK:       define internal void [[HVT6]]
+// CHECK-DAG:   [[LOCALX_A:%.+]] = alloca i32*
+// CHECK-DAG:   [[LOCALX_AA:%.+]] = alloca i16*
+// CHECK-DAG:   [[LOCALX_AAA:%.+]] = alloca i8*
+// CHECK-DAG:   [[LOCALX_B:%.+]] = alloca [10 x i32]*
+// CHECK-DAG:   store i32* [[ARG_A:%.+]], i32** [[LOCALX_A]]
+// CHECK-DAG:   store i16* [[ARG_AA:%.+]], i16** [[LOCALX_AA]]
+// CHECK-DAG:   store i8* [[ARG_AAA:%.+]], i8** [[LOCALX_AAA]]
+// CHECK-DAG:   store [10 x i32]* [[ARG_B:%.+]], [10 x i32]** [[LOCALX_B]]
+// CHECK-DAG:   [[USE_A:%.+]] = load i32*, i32** [[LOCALX_A]]
+// CHECK-DAG:   [[USE_AA:%.+]] = load i16*, i16** [[LOCALX_AA]]
+// CHECK-DAG:   [[USE_AAA:%.+]] = load i8*, i8** [[LOCALX_AAA]]
+// CHECK-DAG:   [[USE_B:%.+]] = load [10 x i32]*, [10 x i32]** [[LOCALX_B]]
+// CHECK-DAG:   load i32, i32* [[USE_A]]
+// CHECK-DAG:   load i16, i16* [[USE_AA]]
+// CHECK-DAG:   load i8, i8* [[USE_AAA]]
+// CHECK-DAG:   getelementptr inbounds [10 x i32], [10 x i32]* [[USE_B]], i{{.*}} 0, i{{.*}} 2
+
+// CHECK:       define internal void [[HVT5]]
+// CHECK-DAG:   [[LOCALX_A:%.+]] = alloca i32*
+// CHECK-DAG:   [[LOCALX_AA:%.+]] = alloca i16*
+// CHECK-DAG:   [[LOCALX_B:%.+]] = alloca [10 x i32]*
+// CHECK-DAG:   store i32* [[ARG_A:%.+]], i32** [[LOCALX_A]]
+// CHECK-DAG:   store i16* [[ARG_AA:%.+]], i16** [[LOCALX_AA]]
+// CHECK-DAG:   store [10 x i32]* [[ARG_B:%.+]], [10 x i32]** [[LOCALX_B]]
+// CHECK-DAG:   [[USE_A:%.+]] = load i32*, i32** [[LOCALX_A]]
+// CHECK-DAG:   [[USE_AA:%.+]] = load i16*, i16** [[LOCALX_AA]]
+// CHECK-DAG:   [[USE_B:%.+]] = load [10 x i32]*, [10 x i32]** [[LOCALX_B]]
+// CHECK-DAG:   load i32, i32* [[USE_A]]
+// CHECK-DAG:   load i16, i16* [[USE_AA]]
+// CHECK-DAG:   getelementptr inbounds [10 x i32], [10 x i32]* [[USE_B]], i{{.*}} 0, i{{.*}} 2
+#endif
Index: lib/CodeGen/CodeGenFunction.h
===================================================================
--- lib/CodeGen/CodeGenFunction.h
+++ lib/CodeGen/CodeGenFunction.h
@@ -1253,13 +1253,14 @@
   /// \brief Emit code for the start of a function.
   /// \param Loc       The location to be associated with the function.
   /// \param StartLoc  The location of the function body.
-  void StartFunction(GlobalDecl GD,
-                     QualType RetTy,
-                     llvm::Function *Fn,
-                     const CGFunctionInfo &FnInfo,
-                     const FunctionArgList &Args,
+  /// \param OffloadingCaptureStmt  The capture statement associated with an
+  /// offloading function, if any. When non-null, extra prologue code is
+  /// emitted to load the explicitly passed captured fields.
+  void StartFunction(GlobalDecl GD, QualType RetTy, llvm::Function *Fn,
+                     const CGFunctionInfo &FnInfo, const FunctionArgList &Args,
                      SourceLocation Loc = SourceLocation(),
-                     SourceLocation StartLoc = SourceLocation());
+                     SourceLocation StartLoc = SourceLocation(),
+                     const CapturedStmt *OffloadingCaptureStmt = nullptr);
 
   void EmitConstructorBody(FunctionArgList &Args);
   void EmitDestructorBody(FunctionArgList &Args);
@@ -1688,6 +1689,10 @@
   std::pair<llvm::Value*,QualType> getVLASize(const VariableArrayType *vla);
   std::pair<llvm::Value*,QualType> getVLASize(QualType vla);
 
+  /// getVLASizeMap - Returns an LLVM value that corresponds to the expression
+  /// \a E that should be associated with a VLA type.
+  llvm::Value *getVLASizeMap(const Expr *E);
+
   /// LoadCXXThis - Load the value of 'this'. This function is only valid while
   /// generating code for an C++ member function.
   llvm::Value *LoadCXXThis() {
@@ -2088,7 +2093,8 @@
   llvm::Function *EmitCapturedStmt(const CapturedStmt &S, CapturedRegionKind K);
   void GenerateCapturedStmtFunctionProlog(const CapturedStmt &S);
   llvm::Function *GenerateCapturedStmtFunctionEpilog(const CapturedStmt &S);
-  llvm::Function *GenerateCapturedStmtFunction(const CapturedStmt &S);
+  llvm::Function *GenerateCapturedStmtFunction(const CapturedStmt &S,
+                                               bool isOffloadFunction = false);
   llvm::Value *GenerateCapturedStmtArgument(const CapturedStmt &S);
   /// \brief Perform element by element copying of arrays with type \a
   /// OriginalType from \a SrcAddr to \a DestAddr using copying procedure
Index: lib/CodeGen/CodeGenFunction.cpp
===================================================================
--- lib/CodeGen/CodeGenFunction.cpp
+++ lib/CodeGen/CodeGenFunction.cpp
@@ -585,13 +585,12 @@
   return false;
 }
 
-void CodeGenFunction::StartFunction(GlobalDecl GD,
-                                    QualType RetTy,
+void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy,
                                     llvm::Function *Fn,
                                     const CGFunctionInfo &FnInfo,
                                     const FunctionArgList &Args,
-                                    SourceLocation Loc,
-                                    SourceLocation StartLoc) {
+                                    SourceLocation Loc, SourceLocation StartLoc,
+                                    const CapturedStmt *OffloadingCaptureStmt) {
   assert(!CurFn &&
          "Do not use a CodeGenFunction object for more than one function");
 
@@ -734,6 +733,41 @@
   PrologueCleanupDepth = EHStack.stable_begin();
   EmitFunctionProlog(*CurFnInfo, CurFn, Args);
 
+  // Emit code required for the offloading function, if any.
+  if (OffloadingCaptureStmt) {
+    auto ai = Args.begin();
+    auto ri = OffloadingCaptureStmt->getCapturedRecordDecl()->field_begin();
+    for (CapturedStmt::const_capture_iterator
+             ci = OffloadingCaptureStmt->capture_begin(),
+             ce = OffloadingCaptureStmt->capture_end();
+         ci != ce; ++ci, ++ai, ++ri) {
+
+      // Obtain the LLVM value associated with the current function argument.
+      llvm::Value *V = LocalDeclMap[*ai];
+      assert(V && "Local value for offloading function argument not found!");
+
+      LValue Addr =
+          LValue::MakeAddr(V, ri->getType(), CharUnits(), CGM.getContext());
+      V = EmitLoadOfLValue(Addr, OffloadingCaptureStmt->getLocStart())
+              .getScalarVal();
+
+      if (ci->capturesVariableArrayType()) {
+        auto VAT = ri->getCapturedVLAType();
+        LValue Addr =
+            LValue::MakeAddr(V, ri->getType(), CharUnits(), CGM.getContext());
+        VLASizeMap[VAT->getSizeExpr()] =
+            EmitLoadOfLValue(Addr, OffloadingCaptureStmt->getLocStart())
+                .getScalarVal();
+        continue;
+      }
+      if (ci->capturesThis()) {
+        CXXThisValue = V;
+        continue;
+      }
+      LocalDeclMap[ci->getCapturedVar()] = V;
+    }
+  }
+
   if (D && isa<CXXMethodDecl>(D) && cast<CXXMethodDecl>(D)->isInstance()) {
     CGM.getCXXABI().EmitInstanceFunctionProlog(*this);
     const CXXMethodDecl *MD = cast<CXXMethodDecl>(D);
@@ -1509,6 +1543,11 @@
 
   return std::pair<llvm::Value*,QualType>(numElements, elementType);
 }
+llvm::Value *CodeGenFunction::getVLASizeMap(const Expr *E) {
+  llvm::Value *vlaSize = VLASizeMap[E];
+  assert(vlaSize && "No VLA size available for the requested expression!");
+  return vlaSize;
+}
 
 void CodeGenFunction::EmitVariablyModifiedType(QualType type) {
   assert(type->isVariablyModifiedType() &&
Index: lib/CodeGen/CGStmtOpenMP.cpp
===================================================================
--- lib/CodeGen/CGStmtOpenMP.cpp
+++ lib/CodeGen/CGStmtOpenMP.cpp
@@ -2093,8 +2093,25 @@
   CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_atomic, CodeGen);
 }
 
-void CodeGenFunction::EmitOMPTargetDirective(const OMPTargetDirective &) {
-  llvm_unreachable("CodeGen for 'omp target' is not supported yet.");
+void CodeGenFunction::EmitOMPTargetDirective(const OMPTargetDirective &S) {
+  LexicalScope Scope(*this, S.getSourceRange());
+
+  // Emit target region as a standalone region.
+  auto &&CodeGen = [&S](CodeGenFunction &CGF) {
+    CGF.EmitStmt(cast<CapturedStmt>(S.getAssociatedStmt())->getCapturedStmt());
+  };
+
+  // Obtain the target region outlined function.
+  llvm::Value *Fn =
+      CGM.getOpenMPRuntime().emitTargetOutlinedFunction(*this, S, CodeGen);
+
+  // Check if we have any if clause associated with the directive
+  const Expr *IfCond = nullptr;
+  if (auto C = S.getSingleClause(OMPC_if)) {
+    IfCond = cast<OMPIfClause>(C)->getCondition();
+  }
+
+  CGM.getOpenMPRuntime().emitTargetCall(*this, S, Fn, IfCond);
 }
 
 void CodeGenFunction::EmitOMPTeamsDirective(const OMPTeamsDirective &) {
Index: lib/CodeGen/CGStmt.cpp
===================================================================
--- lib/CodeGen/CGStmt.cpp
+++ lib/CodeGen/CGStmt.cpp
@@ -2197,7 +2197,8 @@
 
 /// Creates the outlined function for a CapturedStmt.
 llvm::Function *
-CodeGenFunction::GenerateCapturedStmtFunction(const CapturedStmt &S) {
+CodeGenFunction::GenerateCapturedStmtFunction(const CapturedStmt &S,
+                                              bool isOffloadFunction) {
   assert(CapturedStmtInfo &&
     "CapturedStmtInfo should be set when generating the captured function");
   const CapturedDecl *CD = S.getCapturedDecl();
@@ -2208,7 +2209,36 @@
   // Build the argument list.
   ASTContext &Ctx = CGM.getContext();
   FunctionArgList Args;
-  Args.append(CD->param_begin(), CD->param_end());
+
+  // If this is an offload function, we need to pass a reference to each
+  // captured declaration as an argument.
+  if (isOffloadFunction) {
+    DeclContext *DC = CapturedDecl::castToDeclContext(CD)->getParent();
+    auto ri = RD->field_begin();
+    for (CapturedStmt::const_capture_iterator ci = S.capture_begin(),
+                                              ce = S.capture_end();
+         ci != ce; ++ci, ++ri) {
+      StringRef Name;
+      QualType Ty;
+      if (ci->capturesVariableArrayType()) {
+        Ty = Ctx.getPointerType(ri->getType());
+        Name = "__vla_size";
+      } else if (ci->capturesThis()) {
+        Ty = ri->getType();
+        Name = "__this";
+      } else {
+        const VarDecl *VD = ci->getCapturedVar();
+        Ty = Ctx.getPointerType(VD->getType());
+        Name = VD->getName();
+      }
+
+      IdentifierInfo *ParamName = &Ctx.Idents.get(Name);
+      ImplicitParamDecl *Param =
+          ImplicitParamDecl::Create(Ctx, DC, Loc, ParamName, Ty);
+      Args.push_back(Param);
+    }
+  } else
+    Args.append(CD->param_begin(), CD->param_end());
 
   // Create the function declaration.
   FunctionType::ExtInfo ExtInfo;
@@ -2225,31 +2255,36 @@
     F->addFnAttr(llvm::Attribute::NoUnwind);
 
   // Generate the function.
-  StartFunction(CD, Ctx.VoidTy, F, FuncInfo, Args,
-                CD->getLocation(),
-                CD->getBody()->getLocStart());
-  // Set the context parameter in CapturedStmtInfo.
-  llvm::Value *DeclPtr = LocalDeclMap[CD->getContextParam()];
-  assert(DeclPtr && "missing context parameter for CapturedStmt");
-  CapturedStmtInfo->setContextValue(Builder.CreateLoad(DeclPtr));
-
-  // Initialize variable-length arrays.
-  LValue Base = MakeNaturalAlignAddrLValue(CapturedStmtInfo->getContextValue(),
-                                           Ctx.getTagDeclType(RD));
-  for (auto *FD : RD->fields()) {
-    if (FD->hasCapturedVLAType()) {
-      auto *ExprArg = EmitLoadOfLValue(EmitLValueForField(Base, FD),
-                                       S.getLocStart()).getScalarVal();
-      auto VAT = FD->getCapturedVLAType();
-      VLASizeMap[VAT->getSizeExpr()] = ExprArg;
+  StartFunction(CD, Ctx.VoidTy, F, FuncInfo, Args, CD->getLocation(),
+                CD->getBody()->getLocStart(), isOffloadFunction ? &S : nullptr);
+
+  // If this is an offloading function, 'VLAs' and 'this' were already dealt
+  // with in StartFunction().
+  if (!isOffloadFunction) {
+    // Set the context parameter in CapturedStmtInfo.
+    llvm::Value *DeclPtr = LocalDeclMap[CD->getContextParam()];
+    assert(DeclPtr && "missing context parameter for CapturedStmt");
+    CapturedStmtInfo->setContextValue(Builder.CreateLoad(DeclPtr));
+
+    // Initialize variable-length arrays.
+    LValue Base = MakeNaturalAlignAddrLValue(
+        CapturedStmtInfo->getContextValue(), Ctx.getTagDeclType(RD));
+    for (auto *FD : RD->fields()) {
+      if (FD->hasCapturedVLAType()) {
+        auto *ExprArg =
+            EmitLoadOfLValue(EmitLValueForField(Base, FD), S.getLocStart())
+                .getScalarVal();
+        auto VAT = FD->getCapturedVLAType();
+        VLASizeMap[VAT->getSizeExpr()] = ExprArg;
+      }
     }
-  }
 
-  // If 'this' is captured, load it into CXXThisValue.
-  if (CapturedStmtInfo->isCXXThisExprCaptured()) {
-    FieldDecl *FD = CapturedStmtInfo->getThisFieldDecl();
-    LValue ThisLValue = EmitLValueForField(Base, FD);
-    CXXThisValue = EmitLoadOfLValue(ThisLValue, Loc).getScalarVal();
+    // If 'this' is captured, load it into CXXThisValue.
+    if (CapturedStmtInfo->isCXXThisExprCaptured()) {
+      FieldDecl *FD = CapturedStmtInfo->getThisFieldDecl();
+      LValue ThisLValue = EmitLValueForField(Base, FD);
+      CXXThisValue = EmitLoadOfLValue(ThisLValue, Loc).getScalarVal();
+    }
   }
 
   PGO.assignRegionCounters(CD, F);
Index: lib/CodeGen/CGOpenMPRuntime.h
===================================================================
--- lib/CodeGen/CGOpenMPRuntime.h
+++ lib/CodeGen/CGOpenMPRuntime.h
@@ -154,6 +154,14 @@
     // Call to kmp_int32 __kmpc_cancel(ident_t *loc, kmp_int32 global_tid,
     // kmp_int32 cncl_kind);
     OMPRTL__kmpc_cancel,
+
+    //
+    // Offloading related calls
+    //
+    // Call to int32_t __tgt_target(int32_t device_id, void *host_ptr, int32_t
+    // arg_num, void** args_base, void **args, int64_t *arg_sizes, int32_t
+    // *arg_types);
+    OMPRTL__tgt_target,
   };
 
   /// \brief Values for bit flags used in the ident_t to describe the fields.
@@ -177,6 +185,31 @@
     /// \brief Implicit barrier in 'single' directive.
     OMP_IDENT_BARRIER_IMPL_SINGLE = 0x140
   };
+
+  /// \brief Values for bit flags used to specify the mapping type for
+  /// offloading.
+  enum OpenMPOffloadMappingFlags {
+    /// \brief Only allocate memory on the device without moving any data. This
+    /// is the default if no other flags are specified.
+    OMP_MAP_ALLOC = 0x00,
+    /// \brief Allocate memory on the device and move data from host to device.
+    OMP_MAP_TO = 0x01,
+    /// \brief Allocate memory on the device and move data from device to host.
+    OMP_MAP_FROM = 0x02,
+  };
+
+  enum OpenMPOffloadingReservedDeviceIDs {
+    /// \brief Device ID if the device was not defined, runtime should get it
+    /// from environment variables in the spec.
+    OMP_DEVICEID_UNDEF = -1,
+    /// \brief Means the target region should be executed by all devices before
+    /// a regular target region attempts to run on them. Used for Ctors.
+    OMP_DEVICEID_CTORS = -2,
+    /// \brief Means target all devices that were used in the current shared
+    /// library. Used for Dtors.
+    OMP_DEVICEID_DTORS = -3
+  };
+
   CodeGenModule &CGM;
   /// \brief Default const ident_t object used for initialization of all other
   /// ident_t objects.
@@ -707,6 +740,25 @@
   ///
   virtual void emitCancelCall(CodeGenFunction &CGF, SourceLocation Loc,
                               OpenMPDirectiveKind CancelRegion);
+
+  /// \brief Emit outlined function for 'target' directive.
+  /// \param D Directive to emit.
+  /// \param CodeGen Code generation sequence for the \a D directive.
+  virtual llvm::Value *
+  emitTargetOutlinedFunction(CodeGenFunction &CGF,
+                             const OMPExecutableDirective &D,
+                             const RegionCodeGenTy &CodeGen);
+
+  /// \brief Emit the target offloading code associated with \a D. The emitted
+  /// code attempts offloading the execution to the device; in the event of
+  /// a failure it executes the host version outlined in \a OutlinedFn.
+  /// \param D Directive to emit.
+  /// \param OutlinedFn Host version of the code to be offloaded.
+  /// \param IfCond Expression evaluated in if clause associated with the target
+  /// directive, or null if no if clause is used.
+  virtual void emitTargetCall(CodeGenFunction &CGF,
+                              const OMPExecutableDirective &D,
+                              llvm::Value *OutlinedFn, const Expr *IfCond);
 };
 
 } // namespace CodeGen
Index: lib/CodeGen/CGOpenMPRuntime.cpp
===================================================================
--- lib/CodeGen/CGOpenMPRuntime.cpp
+++ lib/CodeGen/CGOpenMPRuntime.cpp
@@ -41,6 +41,8 @@
     /// \brief Region for constructs that do not require function outlining,
     /// like 'for', 'sections', 'atomic' etc. directives.
     InlinedRegion,
+    /// \brief Region with outlined function for standalone 'target' directive.
+    TargetRegion,
   };
 
   CGOpenMPRegionInfo(const CapturedStmt &CS,
@@ -204,6 +206,26 @@
   CGOpenMPRegionInfo *OuterRegionInfo;
 };
 
+/// \brief API for captured statement code generation in OpenMP target
+/// constructs.
+class CGOpenMPTargetRegionInfo : public CGOpenMPRegionInfo {
+public:
+  CGOpenMPTargetRegionInfo(const RegionCodeGenTy &CodeGen)
+      : CGOpenMPRegionInfo(TargetRegion, CodeGen, OMPD_target) {}
+
+  /// \brief This is unused for target regions because each starts executing
+  /// with a single thread.
+  const VarDecl *getThreadIDVariable() const override { return nullptr; }
+
+  /// \brief Get the name of the capture helper.
+  StringRef getHelperName() const override { return ".omp_offloading."; }
+
+  static bool classof(const CGCapturedStmtInfo *Info) {
+    return CGOpenMPRegionInfo::classof(Info) &&
+           cast<CGOpenMPRegionInfo>(Info)->getRegionKind() == TargetRegion;
+  }
+};
+
 /// \brief RAII for emitting code of OpenMP constructs.
 class InlinedOpenMPRegionRAII {
   CodeGenFunction &CGF;
@@ -838,6 +860,22 @@
     RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_cancel");
     break;
   }
+  case OMPRTL__tgt_target: {
+    // Build int32_t __tgt_target(int32_t device_id, void *host_ptr, int32_t
+    // arg_num, void** args_base, void **args, int64_t *arg_sizes, int32_t
+    // *arg_types);
+    llvm::Type *TypeParams[] = {CGM.Int32Ty,
+                                CGM.VoidPtrTy,
+                                CGM.Int32Ty,
+                                CGM.VoidPtrPtrTy,
+                                CGM.VoidPtrPtrTy,
+                                CGM.Int64Ty->getPointerTo(),
+                                CGM.Int32Ty->getPointerTo()};
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__tgt_target");
+    break;
+  }
   }
   return RTLFn;
 }
@@ -2834,3 +2872,209 @@
   }
 }
 
+llvm::Value *
+CGOpenMPRuntime::emitTargetOutlinedFunction(CodeGenFunction &CGF,
+                                            const OMPExecutableDirective &D,
+                                            const RegionCodeGenTy &CodeGen) {
+
+  const CapturedStmt *CS = cast<CapturedStmt>(D.getAssociatedStmt());
+
+  // Generate the outlined target offloading function.
+  CodeGenFunction TargetCGF(CGM, true);
+  CGOpenMPTargetRegionInfo CGInfo(CodeGen);
+  CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(TargetCGF, &CGInfo);
+  return TargetCGF.GenerateCapturedStmtFunction(*CS, true);
+}
+
+void CGOpenMPRuntime::emitTargetCall(CodeGenFunction &CGF,
+                                     const OMPExecutableDirective &D,
+                                     llvm::Value *OutlinedFn,
+                                     const Expr *IfCond) {
+
+  // Fill up the arrays with all the captured variables.
+  SmallVector<llvm::Value *, 16> BasePointers;
+  SmallVector<llvm::Value *, 16> Pointers;
+  SmallVector<llvm::Value *, 16> Sizes;
+  SmallVector<unsigned, 16> MapTypes;
+
+  const CapturedStmt &CS = *cast<CapturedStmt>(D.getAssociatedStmt());
+  auto ri = CS.getCapturedRecordDecl()->field_begin();
+  auto ii = CS.capture_init_begin();
+  for (CapturedStmt::const_capture_iterator ci = CS.capture_begin(),
+                                            ce = CS.capture_end();
+       ci != ce; ++ci, ++ri, ++ii) {
+    StringRef Name;
+    QualType Ty;
+    llvm::Value *BasePointer;
+    llvm::Value *Pointer;
+    llvm::Value *Size;
+    unsigned MapType;
+
+    if (ci->capturesVariableArrayType()) {
+      llvm::Value *V =
+          CGF.getVLASizeMap(ri->getCapturedVLAType()->getSizeExpr());
+      LValue LV = CGF.MakeNaturalAlignAddrLValue(
+          CGF.CreateMemTemp(ri->getType(), "__vla_size"), ri->getType());
+      CGF.EmitStoreThroughLValue(RValue::get(V), LV);
+      BasePointer = Pointer = LV.getAddress();
+      uint64_t SizeVal =
+          CGM.getDataLayout().getTypeSizeInBits(V->getType()) / 8;
+      Size = CGF.Builder.getInt64(SizeVal);
+
+      // VLA sizes don't need to be copied back from the device.
+      MapType = CGOpenMPRuntime::OMP_MAP_TO;
+    } else if (ci->capturesThis()) {
+      BasePointer = Pointer = CGF.LoadCXXThis();
+      llvm::PointerType *PtrTy = cast<llvm::PointerType>(Pointer->getType());
+      uint64_t SizeVal =
+          CGM.getDataLayout().getTypeSizeInBits(PtrTy->getElementType()) / 8;
+      Size = CGF.Builder.getInt64(SizeVal);
+
+      // Default map type.
+      MapType = CGOpenMPRuntime::OMP_MAP_TO | CGOpenMPRuntime::OMP_MAP_FROM;
+    } else {
+      BasePointer = Pointer =
+          CGF.EmitLValue(cast<DeclRefExpr>(*ii)).getAddress();
+      llvm::PointerType *PtrTy = cast<llvm::PointerType>(Pointer->getType());
+      uint64_t SizeVal =
+          CGM.getDataLayout().getTypeSizeInBits(PtrTy->getElementType()) / 8;
+      Size = CGF.Builder.getInt64(SizeVal);
+
+      // Default map type.
+      MapType = CGOpenMPRuntime::OMP_MAP_TO | CGOpenMPRuntime::OMP_MAP_FROM;
+    }
+
+    BasePointers.push_back(BasePointer);
+    Pointers.push_back(Pointer);
+    Sizes.push_back(Size);
+    MapTypes.push_back(MapType);
+  }
+
+  if (IfCond) {
+    // Check if the if clause conditional always evaluates to true or false.
+    // If it evaluates to false, we only need to emit the host version of the
+    // target region. If it evaluates to true, we can proceed with the codegen
+    // as if no if clause was provided.
+    bool CondConstant;
+    if (CGF.ConstantFoldsToSimpleInteger(IfCond, CondConstant)) {
+      if (CondConstant) {
+        IfCond = nullptr;
+      } else {
+        CGF.Builder.CreateCall(OutlinedFn, BasePointers);
+        return;
+      }
+    }
+  }
+
+  // Generate the code to launch the target region. The pattern is the
+  // following:
+  //
+  //   ...
+  //   br IfCond (if any), omp_offload, omp_offload_fail
+  //
+  // omp_offload.try:
+  //   ; create arrays for offloading
+  //   error = __tgt_target(...)
+  //   br error, omp_offload_fail, omp_offload_end
+  //
+  // omp_offload.fail:
+  //   host_version(...)
+  //
+  // omp_offload.end:
+  //   ...
+  //
+
+  auto OffloadTryBlock = CGF.createBasicBlock("omp_offload.try");
+  auto OffloadFailBlock = CGF.createBasicBlock("omp_offload.fail");
+  auto ContBlock = CGF.createBasicBlock("omp_offload.end");
+
+  if (IfCond)
+    CGF.EmitBranchOnBoolExpr(IfCond, OffloadTryBlock, OffloadFailBlock,
+                             /*TrueCount=*/0);
+
+  CGF.EmitBlock(OffloadTryBlock);
+
+  llvm::Value *PointerNum = CGF.Builder.getInt32(BasePointers.size());
+  llvm::Value *BasePointersArray;
+  llvm::Value *PointersArray;
+  llvm::Value *SizesArray;
+  llvm::Value *MapTypesArray;
+
+  if (BasePointers.size()) {
+    BasePointersArray = CGF.Builder.CreateAlloca(CGM.VoidPtrTy, PointerNum,
+                                                 ".offload_baseptrs");
+    PointersArray =
+        CGF.Builder.CreateAlloca(CGM.VoidPtrTy, PointerNum, ".offload_ptrs");
+    SizesArray =
+        CGF.Builder.CreateAlloca(CGM.Int64Ty, PointerNum, ".offload_sizes");
+
+    // The map types are always constant, so we don't need to generate code to
+    // fill that array. Instead, we create an array constant.
+    llvm::Constant *MapTypesArrayInit =
+        llvm::ConstantDataArray::get(CGF.Builder.getContext(), MapTypes);
+    MapTypesArray =
+        new llvm::GlobalVariable(CGM.getModule(), MapTypesArrayInit->getType(),
+                                 true, llvm::GlobalValue::PrivateLinkage,
+                                 MapTypesArrayInit, ".offload_maptypes");
+    MapTypesArray = CGF.Builder.CreateConstGEP2_32(MapTypesArrayInit->getType(),
+                                                   MapTypesArray, 0, 0);
+
+    for (unsigned i = 0; i < BasePointers.size(); ++i) {
+
+      llvm::PointerType *BPPtrTy =
+          cast<llvm::PointerType>(BasePointersArray->getType());
+      llvm::Value *BP = CGF.Builder.CreateConstInBoundsGEP1_32(
+          BPPtrTy->getElementType(), BasePointersArray, i);
+
+      llvm::PointerType *PPtrTy =
+          cast<llvm::PointerType>(PointersArray->getType());
+      llvm::Value *P = CGF.Builder.CreateConstInBoundsGEP1_32(
+          PPtrTy->getElementType(), PointersArray, i);
+
+      llvm::PointerType *SPtrTy =
+          cast<llvm::PointerType>(SizesArray->getType());
+      llvm::Value *S = CGF.Builder.CreateConstInBoundsGEP1_32(
+          SPtrTy->getElementType(), SizesArray, i);
+
+      CGF.Builder.CreateStore(
+          CGF.Builder.CreateBitCast(BasePointers[i], CGM.VoidPtrTy), BP);
+      CGF.Builder.CreateStore(
+          CGF.Builder.CreateBitCast(Pointers[i], CGM.VoidPtrTy), P);
+      CGF.Builder.CreateStore(
+          CGF.Builder.CreateIntCast(Sizes[i], CGM.Int64Ty, true), S);
+    }
+  } else {
+    BasePointersArray = llvm::Constant::getNullValue(CGM.VoidPtrPtrTy);
+    PointersArray = llvm::Constant::getNullValue(CGM.VoidPtrPtrTy);
+    SizesArray = llvm::Constant::getNullValue(CGM.Int64Ty->getPointerTo());
+    MapTypesArray = llvm::Constant::getNullValue(CGM.Int32Ty->getPointerTo());
+  }
+
+  // On top of the arrays that were filled up, the target offloading call takes
+  // as arguments the device id as well as the host pointer. The host pointer
+  // is used by the runtime library to identify the current target region, so
+  // it only has to be unique and not necessarily point to anything. It could be
+  // the pointer to the outlined function that implements the target region, but
+  // we aren't using that so that the compiler doesn't need to keep that, and
+  // could therefore inline the host function if proven worthwhile during
+  // optimization.
+
+  // FIXME: Obtain device ID from the device clause when it becomes supported.
+  llvm::Value *OffloadingArgs[] = {
+      CGF.Builder.getInt32(OMP_DEVICEID_UNDEF),
+      new llvm::GlobalVariable(
+          CGM.getModule(), CGM.Int8Ty, true, llvm::GlobalValue::PrivateLinkage,
+          llvm::Constant::getNullValue(CGM.Int8Ty), ".offload_hstptr"),
+      PointerNum, BasePointersArray, PointersArray, SizesArray, MapTypesArray};
+  auto Return = CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__tgt_target),
+                                    OffloadingArgs);
+  auto Error = CGF.Builder.CreateICmpNE(Return, CGF.Builder.getInt32(0));
+  CGF.Builder.CreateCondBr(Error, OffloadFailBlock, ContBlock);
+
+  CGF.EmitBlock(OffloadFailBlock);
+  CGF.Builder.CreateCall(OutlinedFn, BasePointers);
+  CGF.EmitBranch(ContBlock);
+
+  CGF.EmitBlock(ContBlock, /*IsFinished=*/true);
+  return;
+}
_______________________________________________
cfe-commits mailing list
cfe-commits@cs.uiuc.edu
http://lists.cs.uiuc.edu/mailman/listinfo/cfe-commits

Reply via email to