[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-07-06 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
doru1004 closed this revision.
doru1004 added a comment.

Commit: 1370e568dea84c4ea65fe5c01ef4f4ccc751 



CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883



[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-07-06 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
doru1004 added a comment.

@ABataev thank you for the review! I have now fixed the last nit and will 
commit the patch soon!


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883



[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-07-06 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
doru1004 updated this revision to Diff 537706.

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883

Files:
  clang/lib/CodeGen/CGDecl.cpp
  clang/lib/CodeGen/CGOpenMPRuntime.h
  clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
  clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
  clang/lib/CodeGen/CodeGenFunction.h
  clang/test/OpenMP/amdgcn_target_device_vla.cpp

Index: clang/test/OpenMP/amdgcn_target_device_vla.cpp
===
--- /dev/null
+++ clang/test/OpenMP/amdgcn_target_device_vla.cpp
@@ -0,0 +1,1260 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+int foo1() {
+  int sum = 0.0;
+  #pragma omp target map(tofrom: sum)
+  {
+int N = 10;
+int A[N];
+
+for (int i = 0; i < N; i++)
+  A[i] = i;
+
+for (int i = 0; i < N; i++)
+  sum += A[i];
+  }
+  return sum;
+}
+
+int foo2() {
+  int sum = 0.0;
+  int M = 12;
+  int result[M];
+  #pragma omp target teams distribute parallel for map(from: result[:M])
+  for (int i = 0; i < M; i++) {
+int N = 10;
+int A[N];
+result[i] = i;
+
+for (int j = 0; j < N; j++)
+  A[j] = j;
+
+for (int j = 0; j < N; j++)
+  result[i] += A[j];
+  }
+
+  for (int i = 0; i < M; i++)
+sum += result[i];
+  return sum;
+}
+
+int foo3() {
+  int sum = 0.0;
+  int M = 12;
+  int result[M];
+  #pragma omp target teams distribute map(from: result[:M])
+  for (int i = 0; i < M; i++) {
+int N = 10;
+int A[N];
+result[i] = i;
+
+#pragma omp parallel for
+for (int j = 0; j < N; j++)
+  A[j] = j;
+
+for (int j = 0; j < N; j++)
+  result[i] += A[j];
+  }
+
+  for (int i = 0; i < M; i++)
+sum += result[i];
+  return sum;
+}
+
+int foo4() {
+  int sum = 0.0;
+  int M = 12;
+  int result[M];
+  int N = 10;
+  #pragma omp target teams distribute map(from: result[:M])
+  for (int i = 0; i < M; i++) {
+int A[N];
+result[i] = i;
+
+#pragma omp parallel for
+for (int j = 0; j < N; j++)
+  A[j] = j;
+
+for (int j = 0; j < N; j++)
+  result[i] += A[j];
+  }
+
+  for (int i = 0; i < M; i++)
+sum += result[i];
+  return sum;
+}
+
+int main() {
+  return foo1() + foo2() + foo3() + foo4();
+}
+
+#endif
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1v_l12
+// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[SUM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:[[N:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[__VLA_EXPR0:%.*]] = alloca i64, align 8, addrspace(5)
+// CHECK-NEXT:[[I:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[I1:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
+// CHECK-NEXT:[[N_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N]] to ptr
+// CHECK-NEXT:[[__VLA_EXPR0_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__VLA_EXPR0]] to ptr
+// CHECK-NEXT:[[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
+// CHECK-NEXT:[[I1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I1]] to ptr
+// CHECK-NEXT:store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
+// CHECK-NEXT:[[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
+// CHECK-NEXT:[[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr), i8 1, i1 true)
+// CHECK-NEXT:[[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-NEXT:br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:   user_code.entry:
+// CHECK-NEXT:store i32 10, ptr [[N_ASCAST]], align 4
+// CHECK-NEXT:[[TMP2:%.*]] = load i32, ptr [[N_ASCAST]], align 4
+// CHECK-NEXT:[[TMP3:%.*]] = zext i32 [[TMP2]] to i64
+// CHECK-NEXT:[[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
+// CHECK-NEXT:[[TMP5:%.*]] = add nuw i64 [[TMP4]], 3
+// CHECK-NEXT:[[TMP6:%.*]] = udiv i64 [[TMP5]], 4
+// CHECK-NEXT:[[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
+// CHECK-NEXT:[[A:%.*]] = call align 4 ptr @__kmpc_alloc_shared(i64 [[TMP7]])
+// CHECK-NEXT:store i64 [[TMP3]], ptr [[__VLA_EXPR0_ASCAST]], align 8
+// 

[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-07-06 Thread Alexey Bataev via Phabricator via cfe-commits
ABataev accepted this revision.
ABataev added a comment.
This revision is now accepted and ready to land.

LG with a nit




Comment at: clang/lib/CodeGen/CGDecl.cpp:19
 #include "CGOpenMPRuntime.h"
+#include "CGOpenMPRuntimeGPU.h"
 #include "CodeGenFunction.h"

You can remove this include


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883



[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-07-05 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
doru1004 updated this revision to Diff 537498.

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883

Files:
  clang/lib/CodeGen/CGDecl.cpp
  clang/lib/CodeGen/CGOpenMPRuntime.h
  clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
  clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
  clang/lib/CodeGen/CodeGenFunction.h
  clang/test/OpenMP/amdgcn_target_device_vla.cpp

Index: clang/test/OpenMP/amdgcn_target_device_vla.cpp
===
--- /dev/null
+++ clang/test/OpenMP/amdgcn_target_device_vla.cpp
@@ -0,0 +1,1260 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+int foo1() {
+  int sum = 0.0;
+  #pragma omp target map(tofrom: sum)
+  {
+int N = 10;
+int A[N];
+
+for (int i = 0; i < N; i++)
+  A[i] = i;
+
+for (int i = 0; i < N; i++)
+  sum += A[i];
+  }
+  return sum;
+}
+
+int foo2() {
+  int sum = 0.0;
+  int M = 12;
+  int result[M];
+  #pragma omp target teams distribute parallel for map(from: result[:M])
+  for (int i = 0; i < M; i++) {
+int N = 10;
+int A[N];
+result[i] = i;
+
+for (int j = 0; j < N; j++)
+  A[j] = j;
+
+for (int j = 0; j < N; j++)
+  result[i] += A[j];
+  }
+
+  for (int i = 0; i < M; i++)
+sum += result[i];
+  return sum;
+}
+
+int foo3() {
+  int sum = 0.0;
+  int M = 12;
+  int result[M];
+  #pragma omp target teams distribute map(from: result[:M])
+  for (int i = 0; i < M; i++) {
+int N = 10;
+int A[N];
+result[i] = i;
+
+#pragma omp parallel for
+for (int j = 0; j < N; j++)
+  A[j] = j;
+
+for (int j = 0; j < N; j++)
+  result[i] += A[j];
+  }
+
+  for (int i = 0; i < M; i++)
+sum += result[i];
+  return sum;
+}
+
+int foo4() {
+  int sum = 0.0;
+  int M = 12;
+  int result[M];
+  int N = 10;
+  #pragma omp target teams distribute map(from: result[:M])
+  for (int i = 0; i < M; i++) {
+int A[N];
+result[i] = i;
+
+#pragma omp parallel for
+for (int j = 0; j < N; j++)
+  A[j] = j;
+
+for (int j = 0; j < N; j++)
+  result[i] += A[j];
+  }
+
+  for (int i = 0; i < M; i++)
+sum += result[i];
+  return sum;
+}
+
+int main() {
+  return foo1() + foo2() + foo3() + foo4();
+}
+
+#endif
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1v_l12
+// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[SUM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:[[N:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[__VLA_EXPR0:%.*]] = alloca i64, align 8, addrspace(5)
+// CHECK-NEXT:[[I:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[I1:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
+// CHECK-NEXT:[[N_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N]] to ptr
+// CHECK-NEXT:[[__VLA_EXPR0_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__VLA_EXPR0]] to ptr
+// CHECK-NEXT:[[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
+// CHECK-NEXT:[[I1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I1]] to ptr
+// CHECK-NEXT:store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
+// CHECK-NEXT:[[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
+// CHECK-NEXT:[[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr), i8 1, i1 true)
+// CHECK-NEXT:[[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-NEXT:br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:   user_code.entry:
+// CHECK-NEXT:store i32 10, ptr [[N_ASCAST]], align 4
+// CHECK-NEXT:[[TMP2:%.*]] = load i32, ptr [[N_ASCAST]], align 4
+// CHECK-NEXT:[[TMP3:%.*]] = zext i32 [[TMP2]] to i64
+// CHECK-NEXT:[[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
+// CHECK-NEXT:[[TMP5:%.*]] = add nuw i64 [[TMP4]], 3
+// CHECK-NEXT:[[TMP6:%.*]] = udiv i64 [[TMP5]], 4
+// CHECK-NEXT:[[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
+// CHECK-NEXT:[[A:%.*]] = call align 4 ptr @__kmpc_alloc_shared(i64 [[TMP7]])
+// CHECK-NEXT:store i64 [[TMP3]], ptr [[__VLA_EXPR0_ASCAST]], align 8
+// 

[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-07-05 Thread Alexey Bataev via Phabricator via cfe-commits
ABataev added inline comments.



Comment at: clang/lib/CodeGen/CGDecl.cpp:591
+  auto &RT =
+      *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
+  RT.getKmpcFreeShared(CGF, AddrSizePair);

Same, just CGOpenMPRuntime &RT = CGM.getOpenMPRuntime();



Comment at: clang/lib/CodeGen/CGDecl.cpp:1605
+if (getLangOpts().OpenMPIsDevice) {
+  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGM.getOpenMPRuntime());
+  if (RT.isDelayedVariableLengthDecl(*this, &D)) {

Here and in other places, just remove the cast to CGOpenMPRuntimeGPU;
CGM.getOpenMPRuntime() already provides virtual functions, so use them directly 
without a cast:
```
CGOpenMPRuntime &RT = CGM.getOpenMPRuntime();
```



Comment at: clang/lib/CodeGen/CGOpenMPRuntime.h:699-710
+  /// Get call to __kmpc_alloc_shared
+  virtual std::pair<llvm::Value *, llvm::Value *>
+  getKmpcAllocShared(CodeGenFunction &CGF, const VarDecl *VD) {
+    llvm_unreachable("not implemented");
+  }
+
+  /// Get call to __kmpc_free_shared

doru1004 wrote:
> @ABataev I have added the interface entries here.
Then you are already good, just do not cast to CGOpenMPRuntimeGPU; use 
CGM.getOpenMPRuntime() directly since it already has these member functions.
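
(For readers following the thread: a minimal sketch of the cast-free usage being asked for here. The signatures and the AddrSizePair name are reconstructed from the garbled excerpts above and may differ in detail from the committed patch.)

```
// Declaration side in CGDecl.cpp: virtual dispatch on the base
// CGOpenMPRuntime picks the GPU implementation, so no static_cast is needed.
if (getLangOpts().OpenMPIsDevice) {
  CGOpenMPRuntime &RT = CGM.getOpenMPRuntime();
  if (RT.isDelayedVariableLengthDecl(*this, &D)) {
    // Emits __kmpc_alloc_shared once the VLA size expression has been
    // emitted; returns the {address, size} pair the matching free needs.
    std::pair<llvm::Value *, llvm::Value *> AddrSizePair =
        RT.getKmpcAllocShared(*this, &D);
    // ... record AddrSizePair so the cleanup below can reuse it ...
  }
}

// Cleanup side (cf. the CGDecl.cpp:591 excerpt): emits __kmpc_free_shared.
CGOpenMPRuntime &RT = CGM.getOpenMPRuntime();
RT.getKmpcFreeShared(CGF, AddrSizePair);
```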


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883



[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-07-05 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
doru1004 added inline comments.



Comment at: clang/lib/CodeGen/CGOpenMPRuntime.h:699-710
+  /// Get call to __kmpc_alloc_shared
+  virtual std::pair<llvm::Value *, llvm::Value *>
+  getKmpcAllocShared(CodeGenFunction &CGF, const VarDecl *VD) {
+    llvm_unreachable("not implemented");
+  }
+
+  /// Get call to __kmpc_free_shared

@ABataev I have added the interface entries here.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883



[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-07-05 Thread Alexey Bataev via Phabricator via cfe-commits
ABataev added inline comments.



Comment at: clang/lib/CodeGen/CGDecl.cpp:1606
+  CGOpenMPRuntimeGPU &RT =
+      *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
+  if (RT.isDelayedVariableLengthDecl(*this, &D)) {

doru1004 wrote:
> ABataev wrote:
> > ABataev wrote:
> > > 1. use `static_cast<CGOpenMPRuntimeGPU &>(CGM.getOpenMPRuntime())`
> > > 2. It will crash if your device is not GPU. Better to make 
> > > `getKmpcAllocShared` and `getKmpcFreeShared` virtual (just like 
> > > `isDelayedVariableLengthDecl`) in base CGOpenMPRuntime, since it may be 
> > > required not only for GPU-based devices.
> > Check the second item, please, better to make all new member function 
> > virtual and handle it for non-GPU devices too
> The support I am adding is only meant for GPUs. I am not sure why we need to 
> consider non-GPUs. There already exists a VLA handling for non-GPUs and that 
> one should be used.
1. It will crash the compiler if your device is not a GPU (say, CPU).
2. I'm not asking you to implement it for non-GPU, I'm asking you to provide a common 
interface. The general implementation should call just llvm_unreachable, 
nothing else.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883



[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-07-05 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
doru1004 added inline comments.



Comment at: clang/lib/CodeGen/CGDecl.cpp:1606
+  CGOpenMPRuntimeGPU &RT =
+      *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
+  if (RT.isDelayedVariableLengthDecl(*this, &D)) {

ABataev wrote:
> ABataev wrote:
> > 1. use `static_cast<CGOpenMPRuntimeGPU &>(CGM.getOpenMPRuntime())`
> > 2. It will crash if your device is not GPU. Better to make 
> > `getKmpcAllocShared` and `getKmpcFreeShared` virtual (just like 
> > `isDelayedVariableLengthDecl`) in base CGOpenMPRuntime, since it may be 
> > required not only for GPU-based devices.
> Check the second item, please, better to make all new member function virtual 
> and handle it for non-GPU devices too
The support I am adding is only meant for GPUs. I am not sure why we need to 
consider non-GPUs. There already exists a VLA handling for non-GPUs and that 
one should be used.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883



[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-07-05 Thread Alexey Bataev via Phabricator via cfe-commits
ABataev added inline comments.



Comment at: clang/lib/CodeGen/CGDecl.cpp:1606
+  CGOpenMPRuntimeGPU &RT =
+      *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
+  if (RT.isDelayedVariableLengthDecl(*this, &D)) {

ABataev wrote:
> 1. use `static_cast<CGOpenMPRuntimeGPU &>(CGM.getOpenMPRuntime())`
> 2. It will crash if your device is not GPU. Better to make 
> `getKmpcAllocShared` and `getKmpcFreeShared` virtual (just like 
> `isDelayedVariableLengthDecl`) in base CGOpenMPRuntime, since it may be 
> required not only for GPU-based devices.
Check the second item, please; better to make all the new member functions virtual 
and handle them for non-GPU devices too.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883



[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-07-05 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
doru1004 updated this revision to Diff 537485.

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883

Files:
  clang/lib/CodeGen/CGDecl.cpp
  clang/lib/CodeGen/CGOpenMPRuntime.h
  clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
  clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
  clang/lib/CodeGen/CodeGenFunction.h
  clang/test/OpenMP/amdgcn_target_device_vla.cpp

Index: clang/test/OpenMP/amdgcn_target_device_vla.cpp
===
--- /dev/null
+++ clang/test/OpenMP/amdgcn_target_device_vla.cpp
@@ -0,0 +1,1260 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+int foo1() {
+  int sum = 0.0;
+  #pragma omp target map(tofrom: sum)
+  {
+int N = 10;
+int A[N];
+
+for (int i = 0; i < N; i++)
+  A[i] = i;
+
+for (int i = 0; i < N; i++)
+  sum += A[i];
+  }
+  return sum;
+}
+
+int foo2() {
+  int sum = 0.0;
+  int M = 12;
+  int result[M];
+  #pragma omp target teams distribute parallel for map(from: result[:M])
+  for (int i = 0; i < M; i++) {
+int N = 10;
+int A[N];
+result[i] = i;
+
+for (int j = 0; j < N; j++)
+  A[j] = j;
+
+for (int j = 0; j < N; j++)
+  result[i] += A[j];
+  }
+
+  for (int i = 0; i < M; i++)
+sum += result[i];
+  return sum;
+}
+
+int foo3() {
+  int sum = 0.0;
+  int M = 12;
+  int result[M];
+  #pragma omp target teams distribute map(from: result[:M])
+  for (int i = 0; i < M; i++) {
+int N = 10;
+int A[N];
+result[i] = i;
+
+#pragma omp parallel for
+for (int j = 0; j < N; j++)
+  A[j] = j;
+
+for (int j = 0; j < N; j++)
+  result[i] += A[j];
+  }
+
+  for (int i = 0; i < M; i++)
+sum += result[i];
+  return sum;
+}
+
+int foo4() {
+  int sum = 0.0;
+  int M = 12;
+  int result[M];
+  int N = 10;
+  #pragma omp target teams distribute map(from: result[:M])
+  for (int i = 0; i < M; i++) {
+int A[N];
+result[i] = i;
+
+#pragma omp parallel for
+for (int j = 0; j < N; j++)
+  A[j] = j;
+
+for (int j = 0; j < N; j++)
+  result[i] += A[j];
+  }
+
+  for (int i = 0; i < M; i++)
+sum += result[i];
+  return sum;
+}
+
+int main() {
+  return foo1() + foo2() + foo3() + foo4();
+}
+
+#endif
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1v_l12
+// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[SUM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:[[N:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[__VLA_EXPR0:%.*]] = alloca i64, align 8, addrspace(5)
+// CHECK-NEXT:[[I:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[I1:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
+// CHECK-NEXT:[[N_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N]] to ptr
+// CHECK-NEXT:[[__VLA_EXPR0_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__VLA_EXPR0]] to ptr
+// CHECK-NEXT:[[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
+// CHECK-NEXT:[[I1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I1]] to ptr
+// CHECK-NEXT:store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
+// CHECK-NEXT:[[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
+// CHECK-NEXT:[[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr), i8 1, i1 true)
+// CHECK-NEXT:[[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-NEXT:br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:   user_code.entry:
+// CHECK-NEXT:store i32 10, ptr [[N_ASCAST]], align 4
+// CHECK-NEXT:[[TMP2:%.*]] = load i32, ptr [[N_ASCAST]], align 4
+// CHECK-NEXT:[[TMP3:%.*]] = zext i32 [[TMP2]] to i64
+// CHECK-NEXT:[[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
+// CHECK-NEXT:[[TMP5:%.*]] = add nuw i64 [[TMP4]], 3
+// CHECK-NEXT:[[TMP6:%.*]] = udiv i64 [[TMP5]], 4
+// CHECK-NEXT:[[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
+// CHECK-NEXT:[[A:%.*]] = call align 4 ptr @__kmpc_alloc_shared(i64 [[TMP7]])
+// CHECK-NEXT:store i64 [[TMP3]], 

[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-07-05 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
doru1004 updated this revision to Diff 537478.
doru1004 marked an inline comment as done.

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883

Files:
  clang/lib/CodeGen/CGDecl.cpp
  clang/lib/CodeGen/CGOpenMPRuntime.h
  clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
  clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
  clang/lib/CodeGen/CodeGenFunction.h
  clang/test/OpenMP/amdgcn_target_device_vla.cpp

Index: clang/test/OpenMP/amdgcn_target_device_vla.cpp
===
--- /dev/null
+++ clang/test/OpenMP/amdgcn_target_device_vla.cpp
@@ -0,0 +1,1260 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+int foo1() {
+  int sum = 0.0;
+  #pragma omp target map(tofrom: sum)
+  {
+int N = 10;
+int A[N];
+
+for (int i = 0; i < N; i++)
+  A[i] = i;
+
+for (int i = 0; i < N; i++)
+  sum += A[i];
+  }
+  return sum;
+}
+
+int foo2() {
+  int sum = 0.0;
+  int M = 12;
+  int result[M];
+  #pragma omp target teams distribute parallel for map(from: result[:M])
+  for (int i = 0; i < M; i++) {
+int N = 10;
+int A[N];
+result[i] = i;
+
+for (int j = 0; j < N; j++)
+  A[j] = j;
+
+for (int j = 0; j < N; j++)
+  result[i] += A[j];
+  }
+
+  for (int i = 0; i < M; i++)
+sum += result[i];
+  return sum;
+}
+
+int foo3() {
+  int sum = 0.0;
+  int M = 12;
+  int result[M];
+  #pragma omp target teams distribute map(from: result[:M])
+  for (int i = 0; i < M; i++) {
+int N = 10;
+int A[N];
+result[i] = i;
+
+#pragma omp parallel for
+for (int j = 0; j < N; j++)
+  A[j] = j;
+
+for (int j = 0; j < N; j++)
+  result[i] += A[j];
+  }
+
+  for (int i = 0; i < M; i++)
+sum += result[i];
+  return sum;
+}
+
+int foo4() {
+  int sum = 0.0;
+  int M = 12;
+  int result[M];
+  int N = 10;
+  #pragma omp target teams distribute map(from: result[:M])
+  for (int i = 0; i < M; i++) {
+int A[N];
+result[i] = i;
+
+#pragma omp parallel for
+for (int j = 0; j < N; j++)
+  A[j] = j;
+
+for (int j = 0; j < N; j++)
+  result[i] += A[j];
+  }
+
+  for (int i = 0; i < M; i++)
+sum += result[i];
+  return sum;
+}
+
+int main() {
+  return foo1() + foo2() + foo3() + foo4();
+}
+
+#endif
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1v_l12
+// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[SUM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:[[N:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[__VLA_EXPR0:%.*]] = alloca i64, align 8, addrspace(5)
+// CHECK-NEXT:[[I:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[I1:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
+// CHECK-NEXT:[[N_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N]] to ptr
+// CHECK-NEXT:[[__VLA_EXPR0_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__VLA_EXPR0]] to ptr
+// CHECK-NEXT:[[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
+// CHECK-NEXT:[[I1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I1]] to ptr
+// CHECK-NEXT:store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
+// CHECK-NEXT:[[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
+// CHECK-NEXT:[[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr), i8 1, i1 true)
+// CHECK-NEXT:[[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-NEXT:br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:   user_code.entry:
+// CHECK-NEXT:store i32 10, ptr [[N_ASCAST]], align 4
+// CHECK-NEXT:[[TMP2:%.*]] = load i32, ptr [[N_ASCAST]], align 4
+// CHECK-NEXT:[[TMP3:%.*]] = zext i32 [[TMP2]] to i64
+// CHECK-NEXT:[[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
+// CHECK-NEXT:[[TMP5:%.*]] = add nuw i64 [[TMP4]], 3
+// CHECK-NEXT:[[TMP6:%.*]] = udiv i64 [[TMP5]], 4
+// CHECK-NEXT:[[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
+// CHECK-NEXT:[[A:%.*]] = call align 4 ptr @__kmpc_alloc_shared(i64 

[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-07-03 Thread Alexey Bataev via Phabricator via cfe-commits
ABataev added inline comments.



Comment at: clang/lib/CodeGen/CGDecl.cpp:1606
+  CGOpenMPRuntimeGPU &RT =
+      *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
+  if (RT.isDelayedVariableLengthDecl(*this, &D)) {

1. use `static_cast<CGOpenMPRuntimeGPU &>(CGM.getOpenMPRuntime())`
2. It will crash if your device is not GPU. Better to make `getKmpcAllocShared` 
and `getKmpcFreeShared` virtual (just like `isDelayedVariableLengthDecl`) in 
base CGOpenMPRuntime, since it may be required not only for GPU-based devices.



Comment at: clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp:261
+  else
+DelayedVariableLengthDecls.insert(VD);
+} else

Yep, this is what I meant. The only question: do you really need this new 
parameter? CGF.CapturedStmtInfo provides the list of captures and you can try 
to use it.
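
(A rough sketch of the capture-based distinction suggested here, purely illustrative: whether CGF.CapturedStmtInfo is the right source of truth is exactly the open question in this thread, and the set names follow the excerpt above.)

```
// Inside markAsEscaped, for a variably-modified (VLA) declaration: if the
// captured-statement info of the enclosing target region does not know the
// decl, it was declared inside the region, so its size is only emitted later
// and the __kmpc_alloc_shared call has to be delayed.
bool CapturedByTarget =
    CGF.CapturedStmtInfo && CGF.CapturedStmtInfo->lookup(VD) != nullptr;
if (CapturedByTarget)
  EscapedVariableLengthDecls.insert(VD);  // size known at region entry
else
  DelayedVariableLengthDecls.insert(VD);  // defer the allocation
```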



Comment at: clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp:1100-1104
+// Check if the size of the VLA is available at this point i.e. check that
+// it has been emitted already. If not available then skip it and use
+// delayed emission of __kmpc_alloc_shared.
+if (llvm::is_contained(I->getSecond().DelayedVariableLengthDecls, VD))
+  continue;

Do you still need this check?


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883



[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-06-30 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
doru1004 added a comment.

In any case the patch is good to go. It no longer relies on VLA size checks.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883



[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-06-30 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
doru1004 added a comment.

@ABataev This is as close as I could get it to what you wanted. I don't know 
how to get hold of the target directive so late in the emission process, i.e. in 
the markAsEscaped function. The target directive doesn't get visited by the code 
that checks the vars for escaping, so I cannot get the list of captures from it.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883



[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-06-30 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
doru1004 updated this revision to Diff 536489.

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883

Files:
  clang/lib/CodeGen/CGDecl.cpp
  clang/lib/CodeGen/CGOpenMPRuntime.h
  clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
  clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
  clang/lib/CodeGen/CodeGenFunction.h
  clang/test/OpenMP/amdgcn_target_device_vla.cpp

Index: clang/test/OpenMP/amdgcn_target_device_vla.cpp
===
--- /dev/null
+++ clang/test/OpenMP/amdgcn_target_device_vla.cpp
@@ -0,0 +1,1260 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+int foo1() {
+  int sum = 0.0;
+  #pragma omp target map(tofrom: sum)
+  {
+int N = 10;
+int A[N];
+
+for (int i = 0; i < N; i++)
+  A[i] = i;
+
+for (int i = 0; i < N; i++)
+  sum += A[i];
+  }
+  return sum;
+}
+
+int foo2() {
+  int sum = 0.0;
+  int M = 12;
+  int result[M];
+  #pragma omp target teams distribute parallel for map(from: result[:M])
+  for (int i = 0; i < M; i++) {
+int N = 10;
+int A[N];
+result[i] = i;
+
+for (int j = 0; j < N; j++)
+  A[j] = j;
+
+for (int j = 0; j < N; j++)
+  result[i] += A[j];
+  }
+
+  for (int i = 0; i < M; i++)
+sum += result[i];
+  return sum;
+}
+
+int foo3() {
+  int sum = 0.0;
+  int M = 12;
+  int result[M];
+  #pragma omp target teams distribute map(from: result[:M])
+  for (int i = 0; i < M; i++) {
+int N = 10;
+int A[N];
+result[i] = i;
+
+#pragma omp parallel for
+for (int j = 0; j < N; j++)
+  A[j] = j;
+
+for (int j = 0; j < N; j++)
+  result[i] += A[j];
+  }
+
+  for (int i = 0; i < M; i++)
+sum += result[i];
+  return sum;
+}
+
+int foo4() {
+  int sum = 0.0;
+  int M = 12;
+  int result[M];
+  int N = 10;
+  #pragma omp target teams distribute map(from: result[:M])
+  for (int i = 0; i < M; i++) {
+int A[N];
+result[i] = i;
+
+#pragma omp parallel for
+for (int j = 0; j < N; j++)
+  A[j] = j;
+
+for (int j = 0; j < N; j++)
+  result[i] += A[j];
+  }
+
+  for (int i = 0; i < M; i++)
+sum += result[i];
+  return sum;
+}
+
+int main() {
+  return foo1() + foo2() + foo3() + foo4();
+}
+
+#endif
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1v_l12
+// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[SUM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:[[N:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[__VLA_EXPR0:%.*]] = alloca i64, align 8, addrspace(5)
+// CHECK-NEXT:[[I:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[I1:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
+// CHECK-NEXT:[[N_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N]] to ptr
+// CHECK-NEXT:[[__VLA_EXPR0_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__VLA_EXPR0]] to ptr
+// CHECK-NEXT:[[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
+// CHECK-NEXT:[[I1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I1]] to ptr
+// CHECK-NEXT:store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
+// CHECK-NEXT:[[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
+// CHECK-NEXT:[[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr), i8 1, i1 true)
+// CHECK-NEXT:[[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-NEXT:br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:   user_code.entry:
+// CHECK-NEXT:store i32 10, ptr [[N_ASCAST]], align 4
+// CHECK-NEXT:[[TMP2:%.*]] = load i32, ptr [[N_ASCAST]], align 4
+// CHECK-NEXT:[[TMP3:%.*]] = zext i32 [[TMP2]] to i64
+// CHECK-NEXT:[[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
+// CHECK-NEXT:[[TMP5:%.*]] = add nuw i64 [[TMP4]], 3
+// CHECK-NEXT:[[TMP6:%.*]] = udiv i64 [[TMP5]], 4
+// CHECK-NEXT:[[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
+// CHECK-NEXT:[[A:%.*]] = call align 4 ptr @__kmpc_alloc_shared(i64 [[TMP7]])
+// CHECK-NEXT:store i64 [[TMP3]], ptr [[__VLA_EXPR0_ASCAST]], align 8
+// 

[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-06-30 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
doru1004 added inline comments.



Comment at: clang/lib/CodeGen/CodeGenFunction.h:2806
+  /// Return true if all the emissions for the VLA size have occured.
+  bool hasVLASize(const VariableArrayType *type);
+

ABataev wrote:
> doru1004 wrote:
> > doru1004 wrote:
> > > ABataev wrote:
> > > > doru1004 wrote:
> > > > > doru1004 wrote:
> > > > > > doru1004 wrote:
> > > > > > > ABataev wrote:
> > > > > > > > doru1004 wrote:
> > > > > > > > > ABataev wrote:
> > > > > > > > > > doru1004 wrote:
> > > > > > > > > > > ABataev wrote:
> > > > > > > > > > > > doru1004 wrote:
> > > > > > > > > > > > > ABataev wrote:
> > > > > > > > > > > > > > 1. Is it possible that VariableArrayType does not 
> > > > > > > > > > > > > > have VLA size?
> > > > > > > > > > > > > > 2. Fix param name
> > > > > > > > > > > > > @ABataev How would point 1 happen?
> > > > > > > > > > > > You're adding a function that checks if VLA type has 
> > > > > > > > > > > > VLA size. I'm asking, if it is possible for VLA type to 
> > > > > > > > > > > > not have VLA size at all? Why do you need this function?
> > > > > > > > > > > This function checks if the expression of the size of the 
> > > > > > > > > > > VLA has already been emitted and can be used.
> > > > > > > > > > Why the emission of VLA size can be delayed?
> > > > > > > > > Because the size of the VLA is emitted in the user code and 
> > > > > > > > > the prolog of the function happens before that. The emission 
> > > > > > > > > of the VLA needs to be delayed until its size has been 
> > > > > > > > > emitted in the user code.
> > > > > > > > This is a very fragile approach. Can you instead try to 
> > > > > > > > improve markAsEscaped function and fix insertion of VD to 
> > > > > > > > EscapedVariableLengthDecls and if the declaration is internal 
> > > > > > > > for the target region, insert it to DelayedVariableLengthDecls?
> > > > > > > I am not sure what the condition would be, at that point, to 
> > > > > > > choose between one list or the other. I'm not sure what you mean 
> > > > > > > by the declaration being internal to the target region.
> > > > > > Any thoughts? As far as I can tell all VLAs that reach that point 
> > > > > > belong in `DelayedVariableLengthDecls`
> > > > > @ABataev I cannot think of a condition to use for the distinction in 
> > > > > markedAsEscaped(). Could you please explain in more detail what you 
> > > > > want me to check? I can make the rest of the changes happen no 
> > > > > problem but I don't know what the condition is. Unless you tell me 
> > > > > otherwise, I think the best condition is to check whether the VLA 
> > > > > size has been emitted (i.e. that it is part of the VLASize list), in 
> > > > > which case the code as it is now is fine.
> > > > Can you check that the declaration is not captured in the target 
> > > > context? If it is not captured, it is declared in the target region and 
> > > > should be emitted as delayed.
> > > How do I check that? There doesn't seem to be a list of captured 
> > > variables available at that point in the code.
> > > 
> > So the complication is that the same declaration is captured and not 
> > captured at the same time. It can be declared inside a teams distribute 
> > (not captured) but captured by an inner parallel for (captured). I think I 
> > can come up with something though.
> Need to check the captures in the target regions only
I cannot get a handle on the target directive in the markAsEscaped function in 
order to look at its captures.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883



[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-06-30 Thread Alexey Bataev via Phabricator via cfe-commits
ABataev added inline comments.



Comment at: clang/lib/CodeGen/CodeGenFunction.h:2806
+  /// Return true if all the emissions for the VLA size have occured.
+  bool hasVLASize(const VariableArrayType *type);
+

doru1004 wrote:
> doru1004 wrote:
> > ABataev wrote:
> > > doru1004 wrote:
> > > > doru1004 wrote:
> > > > > doru1004 wrote:
> > > > > > ABataev wrote:
> > > > > > > doru1004 wrote:
> > > > > > > > ABataev wrote:
> > > > > > > > > doru1004 wrote:
> > > > > > > > > > ABataev wrote:
> > > > > > > > > > > doru1004 wrote:
> > > > > > > > > > > > ABataev wrote:
> > > > > > > > > > > > > 1. Is it possible that VariableArrayType does not 
> > > > > > > > > > > > > have VLA size?
> > > > > > > > > > > > > 2. Fix param name
> > > > > > > > > > > > @ABataev How would point 1 happen?
> > > > > > > > > > > You're adding a function that checks if VLA type has VLA 
> > > > > > > > > > > size. I'm asking, if it is possible for VLA type to not 
> > > > > > > > > > > have VLA size at all? Why do you need this function?
> > > > > > > > > > This function checks if the expression of the size of the 
> > > > > > > > > > VLA has already been emitted and can be used.
> > > > > > > > > Why the emission of VLA size can be delayed?
> > > > > > > > Because the size of the VLA is emitted in the user code and the 
> > > > > > > > prolog of the function happens before that. The emission of the 
> > > > > > > > VLA needs to be delayed until its size has been emitted in the 
> > > > > > > > user code.
> > > > > > > This is a very fragile approach. Can you instead try to improve 
> > > > > > > markAsEscaped function and fix insertion of VD to 
> > > > > > > EscapedVariableLengthDecls and if the declaration is internal for 
> > > > > > > the target region, insert it to DelayedVariableLengthDecls?
> > > > > > I am not sure what the condition would be, at that point, to choose 
> > > > > > between one list or the other. I'm not sure what you mean by the 
> > > > > > declaration being internal to the target region.
> > > > > Any thoughts? As far as I can tell all VLAs that reach that point 
> > > > > belong in `DelayedVariableLengthDecls`
> > > > @ABataev I cannot think of a condition to use for the distinction in 
> > > > markedAsEscaped(). Could you please explain in more detail what you 
> > > > want me to check? I can make the rest of the changes happen no problem 
> > > > but I don't know what the condition is. Unless you tell me otherwise, I 
> > > > think the best condition is to check whether the VLA size has been 
> > > > emitted (i.e. that it is part of the VLASize list), in which case the 
> > > > code as it is now is fine.
> > > Can you check that the declaration is not captured in the target context? 
> > > If it is not captured, it is declared in the target region and should be 
> > > emitted as delayed.
> > How do I check that? There doesn't seem to be a list of captured variables 
> > available at that point in the code.
> > 
> So the complication is that the same declaration is captured and not captured 
> at the same time. It can be declared inside a teams distribute (not captured) 
> but captured by an inner parallel for (captured). I think I can come up with 
> something though.
Need to check the captures in the target regions only


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883



[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-06-30 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
doru1004 added inline comments.



Comment at: clang/lib/CodeGen/CodeGenFunction.h:2806
+  /// Return true if all the emissions for the VLA size have occured.
+  bool hasVLASize(const VariableArrayType *type);
+

doru1004 wrote:
> ABataev wrote:
> > doru1004 wrote:
> > > doru1004 wrote:
> > > > doru1004 wrote:
> > > > > ABataev wrote:
> > > > > > doru1004 wrote:
> > > > > > > ABataev wrote:
> > > > > > > > doru1004 wrote:
> > > > > > > > > ABataev wrote:
> > > > > > > > > > doru1004 wrote:
> > > > > > > > > > > ABataev wrote:
> > > > > > > > > > > > 1. Is it possible that VariableArrayType does not have 
> > > > > > > > > > > > VLA size?
> > > > > > > > > > > > 2. Fix param name
> > > > > > > > > > > @ABataev How would point 1 happen?
> > > > > > > > > > You're adding a function that checks if VLA type has VLA 
> > > > > > > > > > size. I'm asking, if it is possible for VLA type to not 
> > > > > > > > > > have VLA size at all? Why do you need this function?
> > > > > > > > > This function checks if the expression of the size of the VLA 
> > > > > > > > > has already been emitted and can be used.
> > > > > > > > Why the emission of VLA size can be delayed?
> > > > > > > Because the size of the VLA is emitted in the user code and the 
> > > > > > > prolog of the function happens before that. The emission of the 
> > > > > > > VLA needs to be delayed until its size has been emitted in the 
> > > > > > > user code.
> > > > > > This is a very fragile approach. Can you instead try to improve 
> > > > > > markAsEscaped function and fix insertion of VD to 
> > > > > > EscapedVariableLengthDecls and if the declaration is internal for 
> > > > > > the target region, insert it to DelayedVariableLengthDecls?
> > > > > I am not sure what the condition would be, at that point, to choose 
> > > > > between one list or the other. I'm not sure what you mean by the 
> > > > > declaration being internal to the target region.
> > > > Any thoughts? As far as I can tell all VLAs that reach that point 
> > > > belong in `DelayedVariableLengthDecls`
> > > @ABataev I cannot think of a condition to use for the distinction in 
> > > markedAsEscaped(). Could you please explain in more detail what you want 
> > > me to check? I can make the rest of the changes happen no problem but I 
> > > don't know what the condition is. Unless you tell me otherwise, I think 
> > > the best condition is to check whether the VLA size has been emitted 
> > > (i.e. that it is part of the VLASize list), in which case the code as it is 
> > > now is fine.
> > Can you check that the declaration is not captured in the target context? 
> > If it is not captured, it is declared in the target region and should be 
> > emitted as delayed.
> How do I check that? There doesn't seem to be a list of captured variables 
> available at that point in the code.
> 
So the complication is that the same declaration is captured and not captured 
at the same time. It can be declared inside a teams distribute (not captured) 
but captured by an inner parallel for (captured). I think I can come up with 
something though.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883



[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-06-30 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
doru1004 added inline comments.



Comment at: clang/lib/CodeGen/CodeGenFunction.h:2806
+  /// Return true if all the emissions for the VLA size have occured.
+  bool hasVLASize(const VariableArrayType *type);
+

ABataev wrote:
> doru1004 wrote:
> > doru1004 wrote:
> > > doru1004 wrote:
> > > > ABataev wrote:
> > > > > doru1004 wrote:
> > > > > > ABataev wrote:
> > > > > > > doru1004 wrote:
> > > > > > > > ABataev wrote:
> > > > > > > > > doru1004 wrote:
> > > > > > > > > > ABataev wrote:
> > > > > > > > > > > 1. Is it possible that VariableArrayType does not have 
> > > > > > > > > > > VLA size?
> > > > > > > > > > > 2. Fix param name
> > > > > > > > > > @ABataev How would point 1 happen?
> > > > > > > > > You're adding a function that checks if VLA type has VLA 
> > > > > > > > > size. I'm asking, if it is possible for VLA type to not have 
> > > > > > > > > VLA size at all? Why do you need this function?
> > > > > > > > This function checks if the expression of the size of the VLA 
> > > > > > > > has already been emitted and can be used.
> > > > > > > Why the emission of VLA size can be delayed?
> > > > > > Because the size of the VLA is emitted in the user code and the 
> > > > > > prolog of the function happens before that. The emission of the VLA 
> > > > > > needs to be delayed until its size has been emitted in the user 
> > > > > > code.
> > > > > This is a very fragile approach. Can you instead try to improve 
> > > > > markAsEscaped function and fix insertion of VD to 
> > > > > EscapedVariableLengthDecls and if the declaration is internal for the 
> > > > > target region, insert it to DelayedVariableLengthDecls?
> > > > I am not sure what the condition would be, at that point, to choose 
> > > > between one list or the other. I'm not sure what you mean by the 
> > > > declaration being internal to the target region.
> > > Any thoughts? As far as I can tell all VLAs that reach that point belong 
> > > in `DelayedVariableLengthDecls`
> > @ABataev I cannot think of a condition to use for the distinction in 
> > markedAsEscaped(). Could you please explain in more detail what you want me 
> > to check? I can make the rest of the changes happen no problem but I don't 
> > know what the condition is. Unless you tell me otherwise, I think the best 
> > condition is to check whether the VLA size has been emitted (i.e. that it 
> > is part of the VLASize list), in which case the code as it is now is fine.
> Can you check that the declaration is not captured in the target context? If 
> it is not captured, it is declared in the target region and should be emitted 
> as delayed.
How do I check that? There doesn't seem to be a list of captured variables 
available at that point in the code.



Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883



[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-06-30 Thread Alexey Bataev via Phabricator via cfe-commits
ABataev added inline comments.



Comment at: clang/lib/CodeGen/CodeGenFunction.h:2806
+  /// Return true if all the emissions for the VLA size have occured.
+  bool hasVLASize(const VariableArrayType *type);
+

doru1004 wrote:
> doru1004 wrote:
> > doru1004 wrote:
> > > ABataev wrote:
> > > > doru1004 wrote:
> > > > > ABataev wrote:
> > > > > > doru1004 wrote:
> > > > > > > ABataev wrote:
> > > > > > > > doru1004 wrote:
> > > > > > > > > ABataev wrote:
> > > > > > > > > > 1. Is it possible that VariableArrayType does not have VLA 
> > > > > > > > > > size?
> > > > > > > > > > 2. Fix param name
> > > > > > > > > @ABataev How would point 1 happen?
> > > > > > > > You're adding a function that checks if VLA type has VLA size. 
> > > > > > > > I'm asking, if it is possible for VLA type to not have VLA size 
> > > > > > > > at all? Why do you need this function?
> > > > > > > This function checks if the expression of the size of the VLA has 
> > > > > > > already been emitted and can be used.
> > > > > > Why the emission of VLA size can be delayed?
> > > > > Because the size of the VLA is emitted in the user code and the 
> > > > > prolog of the function happens before that. The emission of the VLA 
> > > > > needs to be delayed until its size has been emitted in the user code.
> > > > This is a very fragile approach. Can you instead try to improve 
> > > > markAsEscaped function and fix insertion of VD to 
> > > > EscapedVariableLengthDecls and if the declaration is internal for the 
> > > > target region, insert it to DelayedVariableLengthDecls?
> > > I am not sure what the condition would be, at that point, to choose 
> > > between one list or the other. I'm not sure what you mean by the 
> > > declaration being internal to the target region.
> > Any thoughts? As far as I can tell all VLAs that reach that point belong in 
> > `DelayedVariableLengthDecls`
> @ABataev I cannot think of a condition to use for the distinction in 
> markedAsEscaped(). Could you please explain in more detail what you want me 
> to check? I can make the rest of the changes happen no problem but I don't 
> know what the condition is. Unless you tell me otherwise, I think the best 
> condition is to check whether the VLA size has been emitted (i.e. that it is 
> part of the VLASize list), in which case the code as it is now is fine.
Can you check that the declaration is not captured in the target context? If it 
is not captured, it is declared in the target region and should be emitted as 
delayed.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883



[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-06-30 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
doru1004 added inline comments.



Comment at: clang/lib/CodeGen/CodeGenFunction.h:2806
+  /// Return true if all the emissions for the VLA size have occured.
+  bool hasVLASize(const VariableArrayType *type);
+

doru1004 wrote:
> doru1004 wrote:
> > ABataev wrote:
> > > doru1004 wrote:
> > > > ABataev wrote:
> > > > > doru1004 wrote:
> > > > > > ABataev wrote:
> > > > > > > doru1004 wrote:
> > > > > > > > ABataev wrote:
> > > > > > > > > 1. Is it possible that VariableArrayType does not have VLA 
> > > > > > > > > size?
> > > > > > > > > 2. Fix param name
> > > > > > > > @ABataev How would point 1 happen?
> > > > > > > You're adding a function that checks if VLA type has VLA size. 
> > > > > > > I'm asking, if it is possible for VLA type to not have VLA size 
> > > > > > > at all? Why do you need this function?
> > > > > > This function checks if the expression of the size of the VLA has 
> > > > > > already been emitted and can be used.
> > > > > Why the emission of VLA size can be delayed?
> > > > Because the size of the VLA is emitted in the user code and the prolog 
> > > > of the function happens before that. The emission of the VLA needs to 
> > > > be delayed until its size has been emitted in the user code.
> > > This is a very fragile approach. Can you instead try to improve 
> > > markAsEscaped function and fix insertion of VD to 
> > > EscapedVariableLengthDecls and if the declaration is internal for the 
> > > target region, insert it to DelayedVariableLengthDecls?
> > I am not sure what the condition would be, at that point, to choose between 
> > one list or the other. I'm not sure what you mean by the declaration being 
> > internal to the target region.
> Any thoughts? As far as I can tell all VLAs that reach that point belong in 
> `DelayedVariableLengthDecls`
@ABataev I cannot think of a condition to use for the distinction in 
markedAsEscaped(). Could you please explain in more detail what you want me to 
check? I can make the rest of the changes happen no problem but I don't know 
what the condition is. Unless you tell me otherwise, I think the best condition 
is to check whether the VLA size has been emitted (i.e. that it is part of the 
VLASize list), in which case the code as it is now is fine.
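
(The "VLASize list" referred to here is presumably CodeGenFunction::VLASizeMap, which EmitVariablyModifiedType fills with the emitted size values, keyed by the size expression. Under that assumption, the hasVLASize helper quoted earlier in the thread would amount to little more than:)

```
// Sketch only: presence of the size expression in VLASizeMap means the size
// has already been emitted in the current function and can be used.
bool CodeGenFunction::hasVLASize(const VariableArrayType *VAT) {
  return VLASizeMap.count(VAT->getSizeExpr()) != 0;
}
```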


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883



[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-06-30 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
doru1004 added inline comments.



Comment at: clang/lib/CodeGen/CodeGenFunction.h:2806
+  /// Return true if all the emissions for the VLA size have occured.
+  bool hasVLASize(const VariableArrayType *type);
+

doru1004 wrote:
> ABataev wrote:
> > doru1004 wrote:
> > > ABataev wrote:
> > > > doru1004 wrote:
> > > > > ABataev wrote:
> > > > > > doru1004 wrote:
> > > > > > > ABataev wrote:
> > > > > > > > 1. Is it possible that VariableArrayType does not have VLA size?
> > > > > > > > 2. Fix param name
> > > > > > > @ABataev How would point 1 happen?
> > > > > > You're adding a function that checks if VLA type has VLA size. I'm 
> > > > > > asking, if it is possible for VLA type to not have VLA size at all? 
> > > > > > Why do you need this function?
> > > > > This function checks if the expression of the size of the VLA has 
> > > > > already been emitted and can be used.
> > > > Why the emission of VLA size can be delayed?
> > > Because the size of the VLA is emitted in the user code and the prolog of 
> > > the function happens before that. The emission of the VLA needs to be 
> > > delayed until its size has been emitted in the user code.
> > This is a very fragile approach. Can you instead try to improve 
> > markAsEscaped function and fix insertion of VD to 
> > EscapedVariableLengthDecls and if the declaration is internal for the 
> > target region, insert it to DelayedVariableLengthDecls?
> I am not sure what the condition would be, at that point, to choose between 
> one list or the other. I'm not sure what you mean by the declaration being 
> internal to the target region.
Any thoughts? As far as I can tell all VLAs that reach that point belong in 
`DelayedVariableLengthDecls`


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883



[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-06-30 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
doru1004 added inline comments.



Comment at: clang/lib/CodeGen/CodeGenFunction.h:2806
+  /// Return true if all the emissions for the VLA size have occurred.
+  bool hasVLASize(const VariableArrayType *type);
+

ABataev wrote:
> doru1004 wrote:
> > ABataev wrote:
> > > doru1004 wrote:
> > > > ABataev wrote:
> > > > > doru1004 wrote:
> > > > > > ABataev wrote:
> > > > > > > 1. Is it possible that VariableArrayType does not have VLA size?
> > > > > > > 2. Fix param name
> > > > > > @ABataev How would point 1 happen?
> > > > > You're adding a function that checks if VLA type has VLA size. I'm 
> > > > > asking, if it is possible for VLA type to not have VLA size at all? 
> > > > > Why do you need this function?
> > > > This function checks if the expression of the size of the VLA has 
> > > > already been emitted and can be used.
> > > Why can the emission of the VLA size be delayed?
> > Because the size of the VLA is emitted in the user code and the prolog of 
> > the function happens before that. The emission of the VLA needs to be 
> > delayed until its size has been emitted in the user code.
> This is a very fragile approach. Can you instead try to improve the 
> markAsEscaped function and fix the insertion of VD into EscapedVariableLengthDecls, 
> and, if the declaration is internal to the target region, insert it into 
> DelayedVariableLengthDecls?
I am not sure what the condition would be, at that point, to choose between one 
list or the other. I'm not sure what you mean by the declaration being internal 
to the target region.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-06-30 Thread Alexey Bataev via Phabricator via cfe-commits
ABataev added inline comments.



Comment at: clang/lib/CodeGen/CodeGenFunction.h:2806
+  /// Return true if all the emissions for the VLA size have occurred.
+  bool hasVLASize(const VariableArrayType *type);
+

doru1004 wrote:
> ABataev wrote:
> > doru1004 wrote:
> > > ABataev wrote:
> > > > doru1004 wrote:
> > > > > ABataev wrote:
> > > > > > 1. Is it possible that VariableArrayType does not have VLA size?
> > > > > > 2. Fix param name
> > > > > @ABataev How would point 1 happen?
> > > > You're adding a function that checks if VLA type has VLA size. I'm 
> > > > asking, if it is possible for VLA type to not have VLA size at all? Why 
> > > > do you need this function?
> > > This function checks if the expression of the size of the VLA has already 
> > > been emitted and can be used.
> > Why can the emission of the VLA size be delayed?
> Because the size of the VLA is emitted in the user code and the prolog of the 
> function happens before that. The emission of the VLA needs to be delayed 
> until its size has been emitted in the user code.
This is a very fragile approach. Can you instead try to improve the markAsEscaped 
function and fix the insertion of VD into EscapedVariableLengthDecls, and, if the 
declaration is internal to the target region, insert it into 
DelayedVariableLengthDecls?
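One possible shape of that routing, sketched here purely for illustration (it
assumes SetVector-style containers with the names used in this thread, plus a
helper that reports whether the VLA's size expression has already been
emitted):

```
// Inside markAsEscaped (sketch): decide which container the escaped VLA
// declaration goes into, based on whether its size is known in the prolog.
if (const auto *VAT =
        CGF.getContext().getAsVariableArrayType(VD->getType())) {
  if (CGF.hasVLASize(VAT))
    EscapedVariableLengthDecls.insert(VD);  // size already emitted
  else
    DelayedVariableLengthDecls.insert(VD);  // size emitted later in user code
}
```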


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-06-30 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
doru1004 updated this revision to Diff 536326.

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883

Files:
  clang/lib/CodeGen/CGDecl.cpp
  clang/lib/CodeGen/CGOpenMPRuntime.h
  clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
  clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
  clang/lib/CodeGen/CodeGenFunction.cpp
  clang/lib/CodeGen/CodeGenFunction.h
  clang/test/OpenMP/amdgcn_target_device_vla.cpp

Index: clang/test/OpenMP/amdgcn_target_device_vla.cpp
===
--- /dev/null
+++ clang/test/OpenMP/amdgcn_target_device_vla.cpp
@@ -0,0 +1,1260 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+int foo1() {
+  int sum = 0.0;
+  #pragma omp target map(tofrom: sum)
+  {
+int N = 10;
+int A[N];
+
+for (int i = 0; i < N; i++)
+  A[i] = i;
+
+for (int i = 0; i < N; i++)
+  sum += A[i];
+  }
+  return sum;
+}
+
+int foo2() {
+  int sum = 0.0;
+  int M = 12;
+  int result[M];
+  #pragma omp target teams distribute parallel for map(from: result[:M])
+  for (int i = 0; i < M; i++) {
+int N = 10;
+int A[N];
+result[i] = i;
+
+for (int j = 0; j < N; j++)
+  A[j] = j;
+
+for (int j = 0; j < N; j++)
+  result[i] += A[j];
+  }
+
+  for (int i = 0; i < M; i++)
+sum += result[i];
+  return sum;
+}
+
+int foo3() {
+  int sum = 0.0;
+  int M = 12;
+  int result[M];
+  #pragma omp target teams distribute map(from: result[:M])
+  for (int i = 0; i < M; i++) {
+int N = 10;
+int A[N];
+result[i] = i;
+
+#pragma omp parallel for
+for (int j = 0; j < N; j++)
+  A[j] = j;
+
+for (int j = 0; j < N; j++)
+  result[i] += A[j];
+  }
+
+  for (int i = 0; i < M; i++)
+sum += result[i];
+  return sum;
+}
+
+int foo4() {
+  int sum = 0.0;
+  int M = 12;
+  int result[M];
+  int N = 10;
+  #pragma omp target teams distribute map(from: result[:M])
+  for (int i = 0; i < M; i++) {
+int A[N];
+result[i] = i;
+
+#pragma omp parallel for
+for (int j = 0; j < N; j++)
+  A[j] = j;
+
+for (int j = 0; j < N; j++)
+  result[i] += A[j];
+  }
+
+  for (int i = 0; i < M; i++)
+sum += result[i];
+  return sum;
+}
+
+int main() {
+  return foo1() + foo2() + foo3() + foo4();
+}
+
+#endif
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1v_l12
+// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[SUM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:[[N:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[__VLA_EXPR0:%.*]] = alloca i64, align 8, addrspace(5)
+// CHECK-NEXT:[[I:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[I1:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
+// CHECK-NEXT:[[N_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N]] to ptr
+// CHECK-NEXT:[[__VLA_EXPR0_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__VLA_EXPR0]] to ptr
+// CHECK-NEXT:[[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
+// CHECK-NEXT:[[I1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I1]] to ptr
+// CHECK-NEXT:store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
+// CHECK-NEXT:[[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
+// CHECK-NEXT:[[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr), i8 1, i1 true)
+// CHECK-NEXT:[[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-NEXT:br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:   user_code.entry:
+// CHECK-NEXT:store i32 10, ptr [[N_ASCAST]], align 4
+// CHECK-NEXT:[[TMP2:%.*]] = load i32, ptr [[N_ASCAST]], align 4
+// CHECK-NEXT:[[TMP3:%.*]] = zext i32 [[TMP2]] to i64
+// CHECK-NEXT:[[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
+// CHECK-NEXT:[[TMP5:%.*]] = add nuw i64 [[TMP4]], 3
+// CHECK-NEXT:[[TMP6:%.*]] = udiv i64 [[TMP5]], 4
+// CHECK-NEXT:[[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
+// CHECK-NEXT:[[A:%.*]] = call align 4 ptr @__kmpc_alloc_shared(i64 [[TMP7]])

[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-06-30 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
doru1004 added inline comments.



Comment at: clang/lib/CodeGen/CodeGenFunction.h:2806
+  /// Return true if all the emissions for the VLA size have occurred.
+  bool hasVLASize(const VariableArrayType *type);
+

ABataev wrote:
> doru1004 wrote:
> > ABataev wrote:
> > > doru1004 wrote:
> > > > ABataev wrote:
> > > > > 1. Is it possible that VariableArrayType does not have VLA size?
> > > > > 2. Fix param name
> > > > @ABataev How would point 1 happen?
> > > You're adding a function that checks if VLA type has VLA size. I'm 
> > > asking, if it is possible for VLA type to not have VLA size at all? Why 
> > > do you need this function?
> > This function checks if the expression of the size of the VLA has already 
> > been emitted and can be used.
> Why can the emission of the VLA size be delayed?
Because the size of the VLA is emitted in the user code and the prolog of the 
function happens before that. The emission of the VLA needs to be delayed until 
its size has been emitted in the user code.
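For illustration, this is essentially the foo1 pattern from the test in this
patch: the bound N is only emitted inside the target region body, after the
function prolog has already run, so the prolog cannot size and allocate A.

```
#pragma omp target map(tofrom: sum)
{
  int N = 10;   // the VLA size is emitted here, in user code
  int A[N];     // so the __kmpc_alloc_shared for A has to be delayed
  for (int i = 0; i < N; i++)
    A[i] = i;
  for (int i = 0; i < N; i++)
    sum += A[i];
}
```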


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-06-30 Thread Alexey Bataev via Phabricator via cfe-commits
ABataev added inline comments.



Comment at: clang/lib/CodeGen/CodeGenFunction.cpp:2168
+ElementType = VLAType->getElementType();
+llvm::Value *VLASize = VLASizeMap[VLAType->getSizeExpr()];
+if (!VLASize)

Use VLASizeMap.find() instead
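A sketch of what that might look like (assuming VLASizeMap is a DenseMap keyed
by the size expression):

```
// find() avoids default-inserting a null entry for a size expression that
// has not been emitted yet.
auto It = VLASizeMap.find(VLAType->getSizeExpr());
if (It == VLASizeMap.end() || !It->second)
  return false;
llvm::Value *VLASize = It->second;
```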



Comment at: clang/lib/CodeGen/CodeGenFunction.h:2806
+  /// Return true if all the emissions for the VLA size have occurred.
+  bool hasVLASize(const VariableArrayType *type);
+

doru1004 wrote:
> ABataev wrote:
> > doru1004 wrote:
> > > ABataev wrote:
> > > > 1. Is it possible that VariableArrayType does not have VLA size?
> > > > 2. Fix param name
> > > @ABataev How would point 1 happen?
> > You're adding a function that checks if VLA type has VLA size. I'm asking, 
> > if it is possible for VLA type to not have VLA size at all? Why do you need 
> > this function?
> This function checks if the expression of the size of the VLA has already 
> been emitted and can be used.
Why can the emission of the VLA size be delayed?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-06-30 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
doru1004 updated this revision to Diff 536322.

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883

Files:
  clang/lib/CodeGen/CGDecl.cpp
  clang/lib/CodeGen/CGOpenMPRuntime.h
  clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
  clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
  clang/lib/CodeGen/CodeGenFunction.cpp
  clang/lib/CodeGen/CodeGenFunction.h
  clang/test/OpenMP/amdgcn_target_device_vla.cpp

Index: clang/test/OpenMP/amdgcn_target_device_vla.cpp
===
--- /dev/null
+++ clang/test/OpenMP/amdgcn_target_device_vla.cpp
@@ -0,0 +1,1260 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+int foo1() {
+  int sum = 0.0;
+  #pragma omp target map(tofrom: sum)
+  {
+int N = 10;
+int A[N];
+
+for (int i = 0; i < N; i++)
+  A[i] = i;
+
+for (int i = 0; i < N; i++)
+  sum += A[i];
+  }
+  return sum;
+}
+
+int foo2() {
+  int sum = 0.0;
+  int M = 12;
+  int result[M];
+  #pragma omp target teams distribute parallel for map(from: result[:M])
+  for (int i = 0; i < M; i++) {
+int N = 10;
+int A[N];
+result[i] = i;
+
+for (int j = 0; j < N; j++)
+  A[j] = j;
+
+for (int j = 0; j < N; j++)
+  result[i] += A[j];
+  }
+
+  for (int i = 0; i < M; i++)
+sum += result[i];
+  return sum;
+}
+
+int foo3() {
+  int sum = 0.0;
+  int M = 12;
+  int result[M];
+  #pragma omp target teams distribute map(from: result[:M])
+  for (int i = 0; i < M; i++) {
+int N = 10;
+int A[N];
+result[i] = i;
+
+#pragma omp parallel for
+for (int j = 0; j < N; j++)
+  A[j] = j;
+
+for (int j = 0; j < N; j++)
+  result[i] += A[j];
+  }
+
+  for (int i = 0; i < M; i++)
+sum += result[i];
+  return sum;
+}
+
+int foo4() {
+  int sum = 0.0;
+  int M = 12;
+  int result[M];
+  int N = 10;
+  #pragma omp target teams distribute map(from: result[:M])
+  for (int i = 0; i < M; i++) {
+int A[N];
+result[i] = i;
+
+#pragma omp parallel for
+for (int j = 0; j < N; j++)
+  A[j] = j;
+
+for (int j = 0; j < N; j++)
+  result[i] += A[j];
+  }
+
+  for (int i = 0; i < M; i++)
+sum += result[i];
+  return sum;
+}
+
+int main() {
+  return foo1() + foo2() + foo3() + foo4();
+}
+
+#endif
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1v_l12
+// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[SUM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:[[N:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[__VLA_EXPR0:%.*]] = alloca i64, align 8, addrspace(5)
+// CHECK-NEXT:[[I:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[I1:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
+// CHECK-NEXT:[[N_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N]] to ptr
+// CHECK-NEXT:[[__VLA_EXPR0_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__VLA_EXPR0]] to ptr
+// CHECK-NEXT:[[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
+// CHECK-NEXT:[[I1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I1]] to ptr
+// CHECK-NEXT:store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
+// CHECK-NEXT:[[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
+// CHECK-NEXT:[[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr), i8 1, i1 true)
+// CHECK-NEXT:[[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-NEXT:br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:   user_code.entry:
+// CHECK-NEXT:store i32 10, ptr [[N_ASCAST]], align 4
+// CHECK-NEXT:[[TMP2:%.*]] = load i32, ptr [[N_ASCAST]], align 4
+// CHECK-NEXT:[[TMP3:%.*]] = zext i32 [[TMP2]] to i64
+// CHECK-NEXT:[[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
+// CHECK-NEXT:[[TMP5:%.*]] = add nuw i64 [[TMP4]], 3
+// CHECK-NEXT:[[TMP6:%.*]] = udiv i64 [[TMP5]], 4
+// CHECK-NEXT:[[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
+// CHECK-NEXT:[[A:%.*]] = call align 4 ptr @__kmpc_alloc_shared(i64 [[TMP7]])

[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-06-30 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
doru1004 added inline comments.



Comment at: clang/lib/CodeGen/CGDecl.cpp:1605-1606
+if (getLangOpts().OpenMPIsDevice) {
+  CGOpenMPRuntimeGPU &RT =
+  *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
+  if (RT.isDelayedVariableLengthDecl(*this, &D)) {

ABataev wrote:
> No need to cast to CGOpenMPRuntimeGPU since isDelayedVariableLengthDecl is a 
> member of CGOpenMPRuntime.
RT is also used further down to call getKmpcAllocShared().



Comment at: clang/lib/CodeGen/CodeGenFunction.h:2806
+  /// Return true if all the emissions for the VLA size have occurred.
+  bool hasVLASize(const VariableArrayType *type);
+

ABataev wrote:
> doru1004 wrote:
> > ABataev wrote:
> > > 1. Is it possible that VariableArrayType does not have VLA size?
> > > 2. Fix param name
> > @ABataev How would point 1 happen?
> You're adding a function that checks if VLA type has VLA size. I'm asking, if 
> it is possible for VLA type to not have VLA size at all? Why do you need this 
> function?
This function checks if the expression of the size of the VLA has already been 
emitted and can be used.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-06-30 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
doru1004 updated this revision to Diff 536321.
doru1004 marked 4 inline comments as done.

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883

Files:
  clang/lib/CodeGen/CGDecl.cpp
  clang/lib/CodeGen/CGOpenMPRuntime.h
  clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
  clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
  clang/lib/CodeGen/CodeGenFunction.cpp
  clang/lib/CodeGen/CodeGenFunction.h
  clang/test/OpenMP/amdgcn_target_device_vla.cpp

Index: clang/test/OpenMP/amdgcn_target_device_vla.cpp
===
--- /dev/null
+++ clang/test/OpenMP/amdgcn_target_device_vla.cpp
@@ -0,0 +1,1260 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+int foo1() {
+  int sum = 0.0;
+  #pragma omp target map(tofrom: sum)
+  {
+int N = 10;
+int A[N];
+
+for (int i = 0; i < N; i++)
+  A[i] = i;
+
+for (int i = 0; i < N; i++)
+  sum += A[i];
+  }
+  return sum;
+}
+
+int foo2() {
+  int sum = 0.0;
+  int M = 12;
+  int result[M];
+  #pragma omp target teams distribute parallel for map(from: result[:M])
+  for (int i = 0; i < M; i++) {
+int N = 10;
+int A[N];
+result[i] = i;
+
+for (int j = 0; j < N; j++)
+  A[j] = j;
+
+for (int j = 0; j < N; j++)
+  result[i] += A[j];
+  }
+
+  for (int i = 0; i < M; i++)
+sum += result[i];
+  return sum;
+}
+
+int foo3() {
+  int sum = 0.0;
+  int M = 12;
+  int result[M];
+  #pragma omp target teams distribute map(from: result[:M])
+  for (int i = 0; i < M; i++) {
+int N = 10;
+int A[N];
+result[i] = i;
+
+#pragma omp parallel for
+for (int j = 0; j < N; j++)
+  A[j] = j;
+
+for (int j = 0; j < N; j++)
+  result[i] += A[j];
+  }
+
+  for (int i = 0; i < M; i++)
+sum += result[i];
+  return sum;
+}
+
+int foo4() {
+  int sum = 0.0;
+  int M = 12;
+  int result[M];
+  int N = 10;
+  #pragma omp target teams distribute map(from: result[:M])
+  for (int i = 0; i < M; i++) {
+int A[N];
+result[i] = i;
+
+#pragma omp parallel for
+for (int j = 0; j < N; j++)
+  A[j] = j;
+
+for (int j = 0; j < N; j++)
+  result[i] += A[j];
+  }
+
+  for (int i = 0; i < M; i++)
+sum += result[i];
+  return sum;
+}
+
+int main() {
+  return foo1() + foo2() + foo3() + foo4();
+}
+
+#endif
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1v_l12
+// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[SUM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:[[N:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[__VLA_EXPR0:%.*]] = alloca i64, align 8, addrspace(5)
+// CHECK-NEXT:[[I:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[I1:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
+// CHECK-NEXT:[[N_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N]] to ptr
+// CHECK-NEXT:[[__VLA_EXPR0_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__VLA_EXPR0]] to ptr
+// CHECK-NEXT:[[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
+// CHECK-NEXT:[[I1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I1]] to ptr
+// CHECK-NEXT:store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
+// CHECK-NEXT:[[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
+// CHECK-NEXT:[[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr), i8 1, i1 true)
+// CHECK-NEXT:[[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-NEXT:br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:   user_code.entry:
+// CHECK-NEXT:store i32 10, ptr [[N_ASCAST]], align 4
+// CHECK-NEXT:[[TMP2:%.*]] = load i32, ptr [[N_ASCAST]], align 4
+// CHECK-NEXT:[[TMP3:%.*]] = zext i32 [[TMP2]] to i64
+// CHECK-NEXT:[[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
+// CHECK-NEXT:[[TMP5:%.*]] = add nuw i64 [[TMP4]], 3
+// CHECK-NEXT:[[TMP6:%.*]] = udiv i64 [[TMP5]], 4
+// CHECK-NEXT:[[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
+// CHECK-NEXT:[[A:%.*]] = call align 

[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-06-30 Thread Alexey Bataev via Phabricator via cfe-commits
ABataev added inline comments.



Comment at: clang/lib/CodeGen/CodeGenFunction.h:2806
+  /// Return true if all the emissions for the VLA size have occurred.
+  bool hasVLASize(const VariableArrayType *type);
+

doru1004 wrote:
> ABataev wrote:
> > 1. Is it possible that VariableArrayType does not have VLA size?
> > 2. Fix param name
> @ABataev How would point 1 happen?
You're adding a function that checks if VLA type has VLA size. I'm asking, if 
it is possible for VLA type to not have VLA size at all? Why do you need this 
function?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-06-30 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
doru1004 added inline comments.



Comment at: clang/lib/CodeGen/CodeGenFunction.h:2806
+  /// Return true if all the emissions for the VLA size have occurred.
+  bool hasVLASize(const VariableArrayType *type);
+

ABataev wrote:
> 1. Is it possible that VariableArrayType does not have VLA size?
> 2. Fix param name
@ABataev How would point 1 happen?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-06-30 Thread Alexey Bataev via Phabricator via cfe-commits
ABataev added inline comments.



Comment at: clang/lib/CodeGen/CGDecl.cpp:590-591
+void Emit(CodeGenFunction &CGF, Flags EmissionFlags) override {
+  CGOpenMPRuntimeGPU &RT =
+  *(static_cast<CGOpenMPRuntimeGPU *>(&CGF.CGM.getOpenMPRuntime()));
+  RT.getKmpcFreeShared(CGF, AddrSizePair);

```
auto &RT = static_cast<CGOpenMPRuntimeGPU &>(...);
```
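For context, a rough sketch of the cleanup being reviewed with that suggestion
applied (the class name and the exact way the GPU runtime is reached are
assumptions, not verbatim from the diff):

```
// Frees a __kmpc_alloc_shared allocation (address, size) when the VLA's
// scope ends; pushed onto the EHScopeStack when the allocation is emitted.
struct KmpcAllocSharedCleanup final : EHScopeStack::Cleanup {
  std::pair<llvm::Value *, llvm::Value *> AddrSizePair;
  KmpcAllocSharedCleanup(std::pair<llvm::Value *, llvm::Value *> P)
      : AddrSizePair(P) {}
  void Emit(CodeGenFunction &CGF, Flags EmissionFlags) override {
    auto &RT =
        static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
    RT.getKmpcFreeShared(CGF, AddrSizePair);
  }
};
```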



Comment at: clang/lib/CodeGen/CGDecl.cpp:1605-1606
+if (getLangOpts().OpenMPIsDevice) {
+  CGOpenMPRuntimeGPU &RT =
+  *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
+  if (RT.isDelayedVariableLengthDecl(*this, &D)) {

No need to cast to CGOpenMPRuntimeGPU since isDelayedVariableLengthDecl is a 
member of CGOpenMPRuntime.



Comment at: clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp:1116-1120
+  for (const auto *DelayedD : I->getSecond().DelayedVariableLengthDecls)
+if (DelayedD == VD)
+  return true;
+
+  return false;

```
return llvm::is_contained(I->getSecond().DelayedVariableLengthDecls, VD);
```



Comment at: clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp:1152
+CodeGenFunction &CGF,
+std::pair<llvm::Value *, llvm::Value *> AddrSizePair) {
+  // Deallocate the memory for each globalized VLA object

pass it here and in other places as const reference
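The suggested signature would then look roughly like this (a sketch; the
template arguments of the pair are assumed from the surrounding diff):

```
// Takes the (address, size) pair of the globalized VLA by const reference.
void getKmpcFreeShared(
    CodeGenFunction &CGF,
    const std::pair<llvm::Value *, llvm::Value *> &AddrSizePair);
```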



Comment at: clang/lib/CodeGen/CodeGenFunction.h:2806
+  /// Return true if all the emissions for the VLA size have occurred.
+  bool hasVLASize(const VariableArrayType *type);
+

1. Is it possible that VariableArrayType does not have VLA size?
2. Fix param name


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-06-30 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
doru1004 marked 3 inline comments as done.
doru1004 added inline comments.



Comment at: clang/lib/CodeGen/CGDecl.cpp:1605-1609
+(CGM.getContext().getTargetInfo().getTriple().isAMDGPU() ||
+ CGM.getContext().getTargetInfo().getTriple().isNVPTX())) {
+  CGOpenMPRuntimeGPU &RT =
+  *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
+  if (RT.isDelayedVariableLengthDecl(*this, &D)) {

ABataev wrote:
> doru1004 wrote:
> > ABataev wrote:
> > > I think you can drop the triple checks and rely completely on the 
> > > RT.isDelayedVariableLengthDecl(*this, &D) result here
> > I tried it but there is a lit test (which I cannot identify) that hangs 
> > when offloading to the host (I think) so it has to be an actual GPU. Any 
> > ideas?
> Make isDelayedVariableLengthDecl virtual in the base OpenMPRuntime, make it 
> return false by default, and let the GPU implementation return true. This 
> should fix the problem, I hope.
It worked, thank you for the suggestion!


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-06-30 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
doru1004 updated this revision to Diff 536288.

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883

Files:
  clang/lib/CodeGen/CGDecl.cpp
  clang/lib/CodeGen/CGOpenMPRuntime.h
  clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
  clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
  clang/lib/CodeGen/CodeGenFunction.cpp
  clang/lib/CodeGen/CodeGenFunction.h
  clang/test/OpenMP/amdgcn_target_device_vla.cpp

Index: clang/test/OpenMP/amdgcn_target_device_vla.cpp
===
--- /dev/null
+++ clang/test/OpenMP/amdgcn_target_device_vla.cpp
@@ -0,0 +1,1260 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+int foo1() {
+  int sum = 0.0;
+  #pragma omp target map(tofrom: sum)
+  {
+int N = 10;
+int A[N];
+
+for (int i = 0; i < N; i++)
+  A[i] = i;
+
+for (int i = 0; i < N; i++)
+  sum += A[i];
+  }
+  return sum;
+}
+
+int foo2() {
+  int sum = 0.0;
+  int M = 12;
+  int result[M];
+  #pragma omp target teams distribute parallel for map(from: result[:M])
+  for (int i = 0; i < M; i++) {
+int N = 10;
+int A[N];
+result[i] = i;
+
+for (int j = 0; j < N; j++)
+  A[j] = j;
+
+for (int j = 0; j < N; j++)
+  result[i] += A[j];
+  }
+
+  for (int i = 0; i < M; i++)
+sum += result[i];
+  return sum;
+}
+
+int foo3() {
+  int sum = 0.0;
+  int M = 12;
+  int result[M];
+  #pragma omp target teams distribute map(from: result[:M])
+  for (int i = 0; i < M; i++) {
+int N = 10;
+int A[N];
+result[i] = i;
+
+#pragma omp parallel for
+for (int j = 0; j < N; j++)
+  A[j] = j;
+
+for (int j = 0; j < N; j++)
+  result[i] += A[j];
+  }
+
+  for (int i = 0; i < M; i++)
+sum += result[i];
+  return sum;
+}
+
+int foo4() {
+  int sum = 0.0;
+  int M = 12;
+  int result[M];
+  int N = 10;
+  #pragma omp target teams distribute map(from: result[:M])
+  for (int i = 0; i < M; i++) {
+int A[N];
+result[i] = i;
+
+#pragma omp parallel for
+for (int j = 0; j < N; j++)
+  A[j] = j;
+
+for (int j = 0; j < N; j++)
+  result[i] += A[j];
+  }
+
+  for (int i = 0; i < M; i++)
+sum += result[i];
+  return sum;
+}
+
+int main() {
+  return foo1() + foo2() + foo3() + foo4();
+}
+
+#endif
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1v_l12
+// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[SUM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:[[N:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[__VLA_EXPR0:%.*]] = alloca i64, align 8, addrspace(5)
+// CHECK-NEXT:[[I:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[I1:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
+// CHECK-NEXT:[[N_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N]] to ptr
+// CHECK-NEXT:[[__VLA_EXPR0_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__VLA_EXPR0]] to ptr
+// CHECK-NEXT:[[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
+// CHECK-NEXT:[[I1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I1]] to ptr
+// CHECK-NEXT:store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
+// CHECK-NEXT:[[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
+// CHECK-NEXT:[[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr), i8 1, i1 true)
+// CHECK-NEXT:[[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-NEXT:br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:   user_code.entry:
+// CHECK-NEXT:store i32 10, ptr [[N_ASCAST]], align 4
+// CHECK-NEXT:[[TMP2:%.*]] = load i32, ptr [[N_ASCAST]], align 4
+// CHECK-NEXT:[[TMP3:%.*]] = zext i32 [[TMP2]] to i64
+// CHECK-NEXT:[[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
+// CHECK-NEXT:[[TMP5:%.*]] = add nuw i64 [[TMP4]], 3
+// CHECK-NEXT:[[TMP6:%.*]] = udiv i64 [[TMP5]], 4
+// CHECK-NEXT:[[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
+// CHECK-NEXT:[[A:%.*]] = call align 4 ptr @__kmpc_alloc_shared(i64 [[TMP7]])

[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-06-30 Thread Alexey Bataev via Phabricator via cfe-commits
ABataev added inline comments.



Comment at: clang/lib/CodeGen/CGDecl.cpp:1605-1609
+(CGM.getContext().getTargetInfo().getTriple().isAMDGPU() ||
+ CGM.getContext().getTargetInfo().getTriple().isNVPTX())) {
+  CGOpenMPRuntimeGPU &RT =
+  *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
+  if (RT.isDelayedVariableLengthDecl(*this, &D)) {

doru1004 wrote:
> ABataev wrote:
> > I think you can drop the triple checks and rely completely on the 
> > RT.isDelayedVariableLengthDecl(*this, &D) result here
> I tried it but there is a lit test (which I cannot identify) that hangs when 
> offloading to the host (I think) so it has to be an actual GPU. Any ideas?
Make isDelayedVariableLengthDecl virtual in the base OpenMPRuntime, make it 
return false by default, and let the GPU implementation return true. This should 
fix the problem, I hope.
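Sketched out, the hook could look like this (declarations only; the exact
signatures are assumptions):

```
// CGOpenMPRuntime.h: conservative default for the host/base runtime.
virtual bool isDelayedVariableLengthDecl(CodeGenFunction &CGF,
                                         const VarDecl *VD) const {
  return false;
}

// CGOpenMPRuntimeGPU.h: the GPU runtime overrides it and answers based on
// its per-function record of VLAs whose sizes were not known in the prolog.
bool isDelayedVariableLengthDecl(CodeGenFunction &CGF,
                                 const VarDecl *VD) const override;
```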


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-06-30 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
doru1004 added inline comments.



Comment at: clang/lib/CodeGen/CGDecl.cpp:1605-1609
+(CGM.getContext().getTargetInfo().getTriple().isAMDGPU() ||
+ CGM.getContext().getTargetInfo().getTriple().isNVPTX())) {
+  CGOpenMPRuntimeGPU &RT =
+  *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
+  if (RT.isDelayedVariableLengthDecl(*this, &D)) {

ABataev wrote:
> I think you can drop the triple checks and rely completely on the 
> RT.isDelayedVariableLengthDecl(*this, &D) result here
I tried it but there is a lit test (which I cannot identify) that hangs when 
offloading to the host (I think) so it has to be an actual GPU. Any ideas?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-06-30 Thread Alexey Bataev via Phabricator via cfe-commits
ABataev added inline comments.



Comment at: clang/lib/CodeGen/CGDecl.cpp:589
+: AddrSizePair(AddrSizePair) {}
+void Emit(CodeGenFunction &CGF, Flags Flags) override {
+  CGOpenMPRuntimeGPU &RT =

Name of the variable hides the type, potential warning or even error



Comment at: clang/lib/CodeGen/CGDecl.cpp:1605-1609
+(CGM.getContext().getTargetInfo().getTriple().isAMDGPU() ||
+ CGM.getContext().getTargetInfo().getTriple().isNVPTX())) {
+  CGOpenMPRuntimeGPU &RT =
+  *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
+  if (RT.isDelayedVariableLengthDecl(*this, &D)) {

I think you can drop the triple checks and rely completely on the 
RT.isDelayedVariableLengthDecl(*this, &D) result here
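With the virtual hook adopted later in this review (it returns false for
non-GPU runtimes), the check could be reduced to something like the following
sketch (hypothetical):

```
if (getLangOpts().OpenMPIsDevice &&
    CGM.getOpenMPRuntime().isDelayedVariableLengthDecl(*this, &D)) {
  // Delay the __kmpc_alloc_shared emission for this escaped VLA.
}
```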



Comment at: clang/lib/CodeGen/CodeGenFunction.cpp:2164-2174
+bool CodeGenFunction::hasVLASize(const VariableArrayType *type) {
+  QualType elementType;
+  do {
+elementType = type->getElementType();
+llvm::Value *vlaSize = VLASizeMap[type->getSizeExpr()];
+if (!vlaSize)
+  return false;

Fix var naming


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-06-29 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
doru1004 added a comment.

I have modified the patch to do only one thing rather than several things, as 
the previous version did. Essentially, this patch now only handles the delayed 
emission of __kmpc_alloc_shared for VLAs that could not be emitted in the 
prolog of the function. This is now very precise in terms of which VLAs it will 
transform into __kmpc_alloc_shared, i.e. only the ones that were previously 
attempted in the prolog and could not be emitted because their size was missing 
(had not been emitted yet).

I have dropped the previous intention of emitting __kmpc_alloc_shared for 
thread-local variables which have a dynamic size. I am emitting dynamic allocas 
(as the test shows), which will fail in the backend as expected. This behavior 
needs to be resolved separately in the backend, according to @arsenm, and any 
workaround in the frontend would have to live in a standalone patch that can be 
reverted once the backend is fixed.
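Put together, the delayed path described above can be pictured roughly as
follows (a sketch, not the literal diff; getKmpcAllocShared and
getKmpcFreeShared come from this review, while the cleanup name and exact
signatures are assumptions):

```
// In EmitAutoVarAlloca, reached only after the VLA's size expression has
// been emitted in user code:
if (getLangOpts().OpenMPIsDevice &&
    CGM.getOpenMPRuntime().isDelayedVariableLengthDecl(*this, &D)) {
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGM.getOpenMPRuntime());
  // Globalize the escaped VLA with __kmpc_alloc_shared instead of a
  // dynamic alloca; the returned pair is (address, allocated size).
  std::pair<llvm::Value *, llvm::Value *> AddrSizePair =
      RT.getKmpcAllocShared(*this, &D);
  // Free it with __kmpc_free_shared when the variable's scope ends, using a
  // cleanup like the one sketched earlier in this thread.
  EHStack.pushCleanup<KmpcAllocSharedCleanup>(NormalAndEHCleanup, AddrSizePair);
}
```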


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D153883: [Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs

2023-06-29 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
doru1004 updated this revision to Diff 536059.
doru1004 retitled this revision from "[Clang][OpenMP] Enable use of 
__kmpc_alloc_shared for VLAs defined in AMD GPU offloaded regions" to 
"[Clang][OpenMP] Delay emission of __kmpc_alloc_shared for escaped VLAs ".
doru1004 edited the summary of this revision.

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153883/new/

https://reviews.llvm.org/D153883

Files:
  clang/lib/CodeGen/CGDecl.cpp
  clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
  clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
  clang/lib/CodeGen/CodeGenFunction.cpp
  clang/lib/CodeGen/CodeGenFunction.h
  clang/test/OpenMP/amdgcn_target_device_vla.cpp

Index: clang/test/OpenMP/amdgcn_target_device_vla.cpp
===
--- /dev/null
+++ clang/test/OpenMP/amdgcn_target_device_vla.cpp
@@ -0,0 +1,1260 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+int foo1() {
+  int sum = 0.0;
+  #pragma omp target map(tofrom: sum)
+  {
+int N = 10;
+int A[N];
+
+for (int i = 0; i < N; i++)
+  A[i] = i;
+
+for (int i = 0; i < N; i++)
+  sum += A[i];
+  }
+  return sum;
+}
+
+int foo2() {
+  int sum = 0.0;
+  int M = 12;
+  int result[M];
+  #pragma omp target teams distribute parallel for map(from: result[:M])
+  for (int i = 0; i < M; i++) {
+int N = 10;
+int A[N];
+result[i] = i;
+
+for (int j = 0; j < N; j++)
+  A[j] = j;
+
+for (int j = 0; j < N; j++)
+  result[i] += A[j];
+  }
+
+  for (int i = 0; i < M; i++)
+sum += result[i];
+  return sum;
+}
+
+int foo3() {
+  int sum = 0.0;
+  int M = 12;
+  int result[M];
+  #pragma omp target teams distribute map(from: result[:M])
+  for (int i = 0; i < M; i++) {
+int N = 10;
+int A[N];
+result[i] = i;
+
+#pragma omp parallel for
+for (int j = 0; j < N; j++)
+  A[j] = j;
+
+for (int j = 0; j < N; j++)
+  result[i] += A[j];
+  }
+
+  for (int i = 0; i < M; i++)
+sum += result[i];
+  return sum;
+}
+
+int foo4() {
+  int sum = 0.0;
+  int M = 12;
+  int result[M];
+  int N = 10;
+  #pragma omp target teams distribute map(from: result[:M])
+  for (int i = 0; i < M; i++) {
+int A[N];
+result[i] = i;
+
+#pragma omp parallel for
+for (int j = 0; j < N; j++)
+  A[j] = j;
+
+for (int j = 0; j < N; j++)
+  result[i] += A[j];
+  }
+
+  for (int i = 0; i < M; i++)
+sum += result[i];
+  return sum;
+}
+
+int main() {
+  return foo1() + foo2() + foo3() + foo4();
+}
+
+#endif
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1v_l12
+// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[SUM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:[[N:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[__VLA_EXPR0:%.*]] = alloca i64, align 8, addrspace(5)
+// CHECK-NEXT:[[I:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[I1:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
+// CHECK-NEXT:[[N_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N]] to ptr
+// CHECK-NEXT:[[__VLA_EXPR0_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__VLA_EXPR0]] to ptr
+// CHECK-NEXT:[[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
+// CHECK-NEXT:[[I1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I1]] to ptr
+// CHECK-NEXT:store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
+// CHECK-NEXT:[[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
+// CHECK-NEXT:[[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr), i8 1, i1 true)
+// CHECK-NEXT:[[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-NEXT:br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:   user_code.entry:
+// CHECK-NEXT:store i32 10, ptr [[N_ASCAST]], align 4
+// CHECK-NEXT:[[TMP2:%.*]] = load i32, ptr [[N_ASCAST]], align 4
+// CHECK-NEXT:[[TMP3:%.*]] = zext i32 [[TMP2]] to i64
+// CHECK-NEXT:[[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
+// CHECK-NEXT:[[TMP5:%.*]]