[PATCH] D86558: [OPENMP]Add support for allocate vars in untied tasks.

2020-09-15 Thread Alexey Bataev via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rG738bab743b5c: [OPENMP]Add support for allocate vars in 
untied tasks. (authored by ABataev).

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D86558/new/

https://reviews.llvm.org/D86558

Files:
  clang/lib/CodeGen/CGOpenMPRuntime.cpp
  clang/lib/CodeGen/CGOpenMPRuntime.h
  clang/lib/CodeGen/CGStmtOpenMP.cpp
  clang/test/OpenMP/allocate_codegen.cpp
  clang/test/OpenMP/for_lastprivate_codegen.cpp
  clang/test/OpenMP/for_linear_codegen.cpp
  clang/test/OpenMP/for_reduction_codegen_UDR.cpp
  clang/test/OpenMP/parallel_firstprivate_codegen.cpp
  clang/test/OpenMP/parallel_private_codegen.cpp
  clang/test/OpenMP/task_codegen.cpp

Index: clang/test/OpenMP/task_codegen.cpp
===
--- clang/test/OpenMP/task_codegen.cpp
+++ clang/test/OpenMP/task_codegen.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix UNTIEDRT
-// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -x c++ -emit-llvm %s -o - -DUNTIEDRT | FileCheck %s --check-prefix CHECK --check-prefix UNTIEDRT
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s -DUNTIEDRT
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix UNTIEDRT
 //
 // RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -fopenmp-enable-irbuilder -x c++ -emit-llvm %s -o - | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -fopenmp-enable-irbuilder -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s
@@ -14,6 +14,19 @@
 #ifndef HEADER
 #define HEADER
 
+enum omp_allocator_handle_t {
+  omp_null_allocator = 0,
+  omp_default_mem_alloc = 1,
+  omp_large_cap_mem_alloc = 2,
+  omp_const_mem_alloc = 3,
+  omp_high_bw_mem_alloc = 4,
+  omp_low_lat_mem_alloc = 5,
+  omp_cgroup_mem_alloc = 6,
+  omp_pteam_mem_alloc = 7,
+  omp_thread_mem_alloc = 8,
+  KMP_ALLOCATOR_MAX_HANDLE = __UINTPTR_MAX__
+};
+
 // CHECK-DAG: [[IDENT_T:%.+]] = type { i32, i32, i32, i32, i8* }
 // CHECK-DAG: [[STRUCT_SHAREDS:%.+]] = type { i8*, [2 x [[STRUCT_S:%.+]]]* }
 // CHECK-DAG: [[STRUCT_SHAREDS1:%.+]] = type { [2 x [[STRUCT_S:%.+]]]* }
@@ -258,21 +271,26 @@
 a = 4;
 c = 5;
   }
-// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 {{%.*}}, i32 0, i64 48, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY6:@.+]] to i32 (i32, i8*)*))
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 {{%.*}}, i32 0, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY6:@.+]] to i32 (i32, i8*)*))
 // CHECK: call i32 @__kmpc_omp_task([[IDENT_T]]* @{{.+}}, i32 {{%.*}}, i8* [[ORIG_TASK_PTR]])
-#pragma omp task untied
+#pragma omp task untied firstprivate(c) allocate(omp_pteam_mem_alloc:c)
   {
-S s1;
+S s1, s2;
+#ifdef UNTIEDRT
+#pragma omp allocate(s2) allocator(omp_pteam_mem_alloc)
+#endif
+s2.a = 0;
 #pragma omp task
-a = 4;
+a = c = 4;
 #pragma omp taskyield
 s1 = S();
+s2.a = 10;
 #pragma omp taskwait
   }
   return a;
 }
 // CHECK: define internal i32 [[TASK_ENTRY1]](i32 %0, [[KMP_TASK_T]]{{.*}}* noalias %1)
-// CHECK: store i32 15, i32* [[A_PTR:@.+]]
+// CHECK: store i32 15, i32* [[A_PTR:@.+]],
 // CHECK: [[A_VAL:%.+]] = load i32, i32* [[A_PTR]]
 // CHECK: [[A_VAL_I8:%.+]] = trunc i32 [[A_VAL]] to i8
 // CHECK: store i8 [[A_VAL_I8]], i8* %{{.+}}
@@ -294,10 +312,13 @@
 // CHECK: define internal i32
 // CHECK: store i32 4, i32* [[A_PTR]]
 
-// CHECK: define internal i32 [[TASK_ENTRY6]](i32 %0, [[KMP_TASK_T]]{{.*}}* noalias %1)
+// CHECK: define internal i32 [[TASK_ENTRY6]](i32 %0, [[KMP_TASK_T]]{{.*}}* noalias %{{.+}})
 // UNTIEDRT: [[S1_ADDR_PTR:%.+]] = alloca %struct.S*,
-// UNTIEDRT: call void (i8*, ...) %{{.+}}(i8* %{{.+}}, %struct.S** [[S1_ADDR_PTR]])
-// UNTIEDRT: [[S1_ADDR:%.+]] = load %struct.S*, %struct.S** [[S1_ADDR_PTR]],
+// UNTIEDRT: [[S2_ADDR_PTR_REF:%.+]] = alloca %struct.S**,
+// UNTIEDRT: call void (i8*, ...) %{{.+}}(i8* %{{.+}}, %struct.S** [[S1_ADDR_PTR]], %struct.S*** [[S2_ADDR_PTR_REF]])
+// UNTIEDRT-DAG: [[S1_ADDR:%.+]] = load %struct.S*, %struct.S** [[S1_ADDR_PTR]],
+// UNTIEDRT-DAG: [[S2_ADDR_PTR:%.+]] = load %struct.S**, %struct.S*** [[S2_ADDR_PTR_REF]],
+// UNTIEDRT-DAG: [[S2_ADDR:%.+]] = load %struct.S*, %struct.S** [[S2_ADDR_PTR]],
 // CHECK: switch i32 %{{.+}}, label %[[DONE:.+]] [
 
 // CHECK: [[DONE]]:
@@ -309,16 +330,25 @@
 // UNTIEDRT: br label %[[EXIT:[^,]+]]
 
 // 

[PATCH] D86558: [OPENMP]Add support for allocate vars in untied tasks.

2020-09-09 Thread Johannes Doerfert via Phabricator via cfe-commits
jdoerfert accepted this revision.
jdoerfert added a comment.
This revision is now accepted and ready to land.

LGTM


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D86558/new/

https://reviews.llvm.org/D86558

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D86558: [OPENMP]Add support for allocate vars in untied tasks.

2020-08-28 Thread Alexey Bataev via Phabricator via cfe-commits
ABataev updated this revision to Diff 288711.
ABataev added a comment.
Herald added a subscriber: danielkiss.

Rebase


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D86558/new/

https://reviews.llvm.org/D86558

Files:
  clang/lib/CodeGen/CGOpenMPRuntime.cpp
  clang/lib/CodeGen/CGOpenMPRuntime.h
  clang/lib/CodeGen/CGStmtOpenMP.cpp
  clang/test/OpenMP/allocate_codegen.cpp
  clang/test/OpenMP/for_lastprivate_codegen.cpp
  clang/test/OpenMP/for_linear_codegen.cpp
  clang/test/OpenMP/for_reduction_codegen_UDR.cpp
  clang/test/OpenMP/parallel_firstprivate_codegen.cpp
  clang/test/OpenMP/parallel_private_codegen.cpp
  clang/test/OpenMP/task_codegen.cpp

Index: clang/test/OpenMP/task_codegen.cpp
===
--- clang/test/OpenMP/task_codegen.cpp
+++ clang/test/OpenMP/task_codegen.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix UNTIEDRT
-// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -x c++ -emit-llvm %s -o - -DUNTIEDRT | FileCheck %s --check-prefix CHECK --check-prefix UNTIEDRT
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s -DUNTIEDRT
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix UNTIEDRT
 //
 // RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -fopenmp-enable-irbuilder -x c++ -emit-llvm %s -o - | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -fopenmp-enable-irbuilder -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s
@@ -14,6 +14,19 @@
 #ifndef HEADER
 #define HEADER
 
+enum omp_allocator_handle_t {
+  omp_null_allocator = 0,
+  omp_default_mem_alloc = 1,
+  omp_large_cap_mem_alloc = 2,
+  omp_const_mem_alloc = 3,
+  omp_high_bw_mem_alloc = 4,
+  omp_low_lat_mem_alloc = 5,
+  omp_cgroup_mem_alloc = 6,
+  omp_pteam_mem_alloc = 7,
+  omp_thread_mem_alloc = 8,
+  KMP_ALLOCATOR_MAX_HANDLE = __UINTPTR_MAX__
+};
+
 // CHECK-DAG: [[IDENT_T:%.+]] = type { i32, i32, i32, i32, i8* }
 // CHECK-DAG: [[STRUCT_SHAREDS:%.+]] = type { i8*, [2 x [[STRUCT_S:%.+]]]* }
 // CHECK-DAG: [[STRUCT_SHAREDS1:%.+]] = type { [2 x [[STRUCT_S:%.+]]]* }
@@ -258,21 +271,26 @@
 a = 4;
 c = 5;
   }
-// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 {{%.*}}, i32 0, i64 48, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY6:@.+]] to i32 (i32, i8*)*))
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 {{%.*}}, i32 0, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY6:@.+]] to i32 (i32, i8*)*))
 // CHECK: call i32 @__kmpc_omp_task([[IDENT_T]]* @{{.+}}, i32 {{%.*}}, i8* [[ORIG_TASK_PTR]])
-#pragma omp task untied
+#pragma omp task untied firstprivate(c) allocate(omp_pteam_mem_alloc:c)
   {
-S s1;
+S s1, s2;
+#ifdef UNTIEDRT
+#pragma omp allocate(s2) allocator(omp_pteam_mem_alloc)
+#endif
+s2.a = 0;
 #pragma omp task
-a = 4;
+a = c = 4;
 #pragma omp taskyield
 s1 = S();
+s2.a = 10;
 #pragma omp taskwait
   }
   return a;
 }
 // CHECK: define internal i32 [[TASK_ENTRY1]](i32 %0, [[KMP_TASK_T]]{{.*}}* noalias %1)
-// CHECK: store i32 15, i32* [[A_PTR:@.+]]
+// CHECK: store i32 15, i32* [[A_PTR:@.+]],
 // CHECK: [[A_VAL:%.+]] = load i32, i32* [[A_PTR]]
 // CHECK: [[A_VAL_I8:%.+]] = trunc i32 [[A_VAL]] to i8
 // CHECK: store i8 [[A_VAL_I8]], i8* %{{.+}}
@@ -294,10 +312,13 @@
 // CHECK: define internal i32
 // CHECK: store i32 4, i32* [[A_PTR]]
 
-// CHECK: define internal i32 [[TASK_ENTRY6]](i32 %0, [[KMP_TASK_T]]{{.*}}* noalias %1)
+// CHECK: define internal i32 [[TASK_ENTRY6]](i32 %0, [[KMP_TASK_T]]{{.*}}* noalias %{{.+}})
 // UNTIEDRT: [[S1_ADDR_PTR:%.+]] = alloca %struct.S*,
-// UNTIEDRT: call void (i8*, ...) %{{.+}}(i8* %{{.+}}, %struct.S** [[S1_ADDR_PTR]])
-// UNTIEDRT: [[S1_ADDR:%.+]] = load %struct.S*, %struct.S** [[S1_ADDR_PTR]],
+// UNTIEDRT: [[S2_ADDR_PTR_REF:%.+]] = alloca %struct.S**,
+// UNTIEDRT: call void (i8*, ...) %{{.+}}(i8* %{{.+}}, %struct.S** [[S1_ADDR_PTR]], %struct.S*** [[S2_ADDR_PTR_REF]])
+// UNTIEDRT-DAG: [[S1_ADDR:%.+]] = load %struct.S*, %struct.S** [[S1_ADDR_PTR]],
+// UNTIEDRT-DAG: [[S2_ADDR_PTR:%.+]] = load %struct.S**, %struct.S*** [[S2_ADDR_PTR_REF]],
+// UNTIEDRT-DAG: [[S2_ADDR:%.+]] = load %struct.S*, %struct.S** [[S2_ADDR_PTR]],
 // CHECK: switch i32 %{{.+}}, label %[[DONE:.+]] [
 
 // CHECK: [[DONE]]:
@@ -309,16 +330,25 @@
 // UNTIEDRT: br label %[[EXIT:[^,]+]]
 
 // UNTIEDRT: call void [[CONSTR:@.+]](%struct.S* [[S1_ADDR]])
+// UNTI

[PATCH] D86558: [OPENMP]Add support for allocate vars in untied tasks.

2020-08-25 Thread Alexey Bataev via Phabricator via cfe-commits
ABataev created this revision.
ABataev added a reviewer: jdoerfert.
Herald added subscribers: guansong, yaxunl.
Herald added a project: clang.
ABataev requested review of this revision.
Herald added a subscriber: sstefan1.

Local vars, marked with pragma allocate, mustbe allocate by the call of
the runtime function and cannot be allocated as other local variables.
Instead, we allocate a space for the pointer in private record and store
the address, returned by kmpc_alloc call in this pointer.
So, for untied tasks

  #pragma omp task untied
  {
S s;
 #pragma omp allocate(s) allocator(allocator)
s = x;
  }

compiler generates something like this:

  struct task_with_privates {
S *ptr;
  };
  
  void entry(task_with_privates *p) {
S *s = p->s;
switch(partid) {
case 1:
  p->s = (S*)kmpc_alloc();
  kmpc_omp_task();
  br exit;
case 2:
  *s = x;
  kmpc_omp_task();
  br exit;
case 2:
  ~S(s);
  kmpc_free((void*)s);
  br exit;
}
  exit:
  }


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D86558

Files:
  clang/lib/CodeGen/CGOpenMPRuntime.cpp
  clang/lib/CodeGen/CGOpenMPRuntime.h
  clang/lib/CodeGen/CGStmtOpenMP.cpp
  clang/test/OpenMP/allocate_codegen.cpp
  clang/test/OpenMP/for_lastprivate_codegen.cpp
  clang/test/OpenMP/for_linear_codegen.cpp
  clang/test/OpenMP/for_reduction_codegen_UDR.cpp
  clang/test/OpenMP/parallel_firstprivate_codegen.cpp
  clang/test/OpenMP/parallel_private_codegen.cpp
  clang/test/OpenMP/task_codegen.cpp

Index: clang/test/OpenMP/task_codegen.cpp
===
--- clang/test/OpenMP/task_codegen.cpp
+++ clang/test/OpenMP/task_codegen.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix UNTIEDRT
-// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -x c++ -emit-llvm %s -o - -DUNTIEDRT | FileCheck %s --check-prefix CHECK --check-prefix UNTIEDRT
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s -DUNTIEDRT
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix UNTIEDRT
 //
 // RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -fopenmp-enable-irbuilder -x c++ -emit-llvm %s -o - | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -fopenmp-enable-irbuilder -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s
@@ -14,6 +14,19 @@
 #ifndef HEADER
 #define HEADER
 
+enum omp_allocator_handle_t {
+  omp_null_allocator = 0,
+  omp_default_mem_alloc = 1,
+  omp_large_cap_mem_alloc = 2,
+  omp_const_mem_alloc = 3,
+  omp_high_bw_mem_alloc = 4,
+  omp_low_lat_mem_alloc = 5,
+  omp_cgroup_mem_alloc = 6,
+  omp_pteam_mem_alloc = 7,
+  omp_thread_mem_alloc = 8,
+  KMP_ALLOCATOR_MAX_HANDLE = __UINTPTR_MAX__
+};
+
 // CHECK-DAG: [[IDENT_T:%.+]] = type { i32, i32, i32, i32, i8* }
 // CHECK-DAG: [[STRUCT_SHAREDS:%.+]] = type { i8*, [2 x [[STRUCT_S:%.+]]]* }
 // CHECK-DAG: [[STRUCT_SHAREDS1:%.+]] = type { [2 x [[STRUCT_S:%.+]]]* }
@@ -258,21 +271,26 @@
 a = 4;
 c = 5;
   }
-// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 {{%.*}}, i32 0, i64 48, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY6:@.+]] to i32 (i32, i8*)*))
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 {{%.*}}, i32 0, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY6:@.+]] to i32 (i32, i8*)*))
 // CHECK: call i32 @__kmpc_omp_task([[IDENT_T]]* @{{.+}}, i32 {{%.*}}, i8* [[ORIG_TASK_PTR]])
-#pragma omp task untied
+#pragma omp task untied firstprivate(c) allocate(omp_pteam_mem_alloc:c)
   {
-S s1;
+S s1, s2;
+#ifdef UNTIEDRT
+#pragma omp allocate(s2) allocator(omp_pteam_mem_alloc)
+#endif
+s2.a = 0;
 #pragma omp task
-a = 4;
+a = c = 4;
 #pragma omp taskyield
 s1 = S();
+s2.a = 10;
 #pragma omp taskwait
   }
   return a;
 }
 // CHECK: define internal i32 [[TASK_ENTRY1]](i32 %0, [[KMP_TASK_T]]{{.*}}* noalias %1)
-// CHECK: store i32 15, i32* [[A_PTR:@.+]]
+// CHECK: store i32 15, i32* [[A_PTR:@.+]],
 // CHECK: [[A_VAL:%.+]] = load i32, i32* [[A_PTR]]
 // CHECK: [[A_VAL_I8:%.+]] = trunc i32 [[A_VAL]] to i8
 // CHECK: store i8 [[A_VAL_I8]], i8* %{{.+}}
@@ -294,10 +312,13 @@
 // CHECK: define internal i32
 // CHECK: store i32 4, i32* [[A_PTR]]
 
-// CHECK: define internal i32 [[TASK_ENTRY6]](i32 %0, [[KMP_TASK_T]]{{.*}}* noalias %1)
+// CHECK: define internal i32 [[TASK_ENTRY6]](i32 %0, [[KMP_TASK_T]]{{.*}}* noalias %{{.+}})
 // UNTIEDRT: [[S1_ADDR_