ABataev created this revision.
ABataev added reviewers: rjmccall, hfinkel.
Herald added subscribers: arphaman, guansong.
Herald added a reviewer: jdoerfert.
Herald added a project: clang.

Basic codegen for the declarations marked as nontemporal. Also, if the
base declaration in the member expression is marked as nontemporal,
lvalue for member decl access inherits nonteporal flag from the base
lvalue.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D71708

Files:
  clang/include/clang/AST/OpenMPClause.h
  clang/include/clang/AST/RecursiveASTVisitor.h
  clang/lib/AST/OpenMPClause.cpp
  clang/lib/AST/StmtProfile.cpp
  clang/lib/CodeGen/CGExpr.cpp
  clang/lib/CodeGen/CGOpenMPRuntime.cpp
  clang/lib/CodeGen/CGOpenMPRuntime.h
  clang/lib/CodeGen/CGStmtOpenMP.cpp
  clang/lib/Sema/SemaOpenMP.cpp
  clang/lib/Serialization/ASTReader.cpp
  clang/lib/Serialization/ASTWriter.cpp
  clang/test/OpenMP/distribute_simd_codegen.cpp
  clang/test/OpenMP/for_simd_codegen.cpp
  clang/test/OpenMP/simd_codegen.cpp
  clang/test/OpenMP/target_parallel_for_simd_codegen.cpp
  clang/test/OpenMP/target_simd_codegen.cpp
  clang/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp
  clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp
  clang/test/OpenMP/teams_distribute_simd_codegen.cpp
  clang/tools/libclang/CIndex.cpp

Index: clang/tools/libclang/CIndex.cpp
===================================================================
--- clang/tools/libclang/CIndex.cpp
+++ clang/tools/libclang/CIndex.cpp
@@ -2458,6 +2458,8 @@
 void OMPClauseEnqueue::VisitOMPNontemporalClause(
     const OMPNontemporalClause *C) {
   VisitOMPClauseList(C);
+  for (const auto *E : C->private_refs())
+    Visitor->AddStmt(E);
 }
 }
 
Index: clang/test/OpenMP/teams_distribute_simd_codegen.cpp
===================================================================
--- clang/test/OpenMP/teams_distribute_simd_codegen.cpp
+++ clang/test/OpenMP/teams_distribute_simd_codegen.cpp
@@ -177,16 +177,16 @@
   // CK3: define {{.*}}i32 @{{.+}}foo{{.+}}(
   int foo(void) {
 
-  // CK3: call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 2, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* %{{.+}}, i64* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 0, i32 1)
+  // CK3: call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 3, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* %{{.+}}, i64* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 0, i32 1)
   // CK3: call void @[[OFFL1:.+]]([[SSI]]* %{{.+}})
     #pragma omp target
 #ifdef OMP5
-    #pragma omp teams distribute simd if(b)
+    #pragma omp teams distribute simd if(b) nontemporal(a, b)
 #else
     #pragma omp teams distribute simd
 #endif // OMP5
     for(int i = 0; i < X; i++) {
-      a[i] = (T)0;
+      a[i] = (T)b;
     }
 
       // outlined target region
@@ -197,6 +197,8 @@
 
   // CK3: define internal void @[[OUTL1]]({{.+}})
   // CK3: call void @__kmpc_for_static_init_4(
+  // OMP3_45-NOT: !nontemporal
+  // OMP3_50: load float,{{.*}}!nontemporal
   // CK3: call void @__kmpc_for_static_fini(
   // CK3: ret void
 
Index: clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp
===================================================================
--- clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp
+++ clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp
@@ -165,7 +165,7 @@
 
   // CHECK:       call void [[HVT1:@.+]](i[[SZ]] {{[^,]+}})
 #ifdef OMP5
-  #pragma omp target teams distribute simd if(target: 0) safelen(32) linear(a) if(simd: 1)
+  #pragma omp target teams distribute simd if(target: 0) safelen(32) linear(a) if(simd: 1) nontemporal(a)
 #else
   #pragma omp target teams distribute simd if(target: 0) safelen(32) linear(a)
 #endif // OMP5
@@ -395,6 +395,8 @@
 // CHECK:       [[AA_ADDR:%.+]] = alloca i[[SZ]], align
 // CHECK:       store i[[SZ]] %{{.+}}, i[[SZ]]* [[AA_ADDR]], align
 // CHECK-64:    [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i32*
+// OMP45-NOT:   !nontemporal
+// OMP50:       load i32,{{.*}}!nontemporal
 // CHECK-64:    store i32 10, i32* [[AA_CADDR]], align
 // CHECK-32:    store i32 10, i32* [[AA_ADDR]], align
 // CHECK:       ret void
Index: clang/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp
===================================================================
--- clang/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp
+++ clang/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp
@@ -35,20 +35,25 @@
 // CHECK: call i{{[0-9]+}} @__tgt_target_teams(
 // CHECK: call void [[OFFLOADING_FUN_1:@.+]](
 #ifdef OMP5
-#pragma omp target teams distribute parallel for simd if(simd: true)
+#pragma omp target teams distribute parallel for simd if(simd: true) nontemporal(Arg)
 #else
 #pragma omp target teams distribute parallel for simd
 #endif // OMP5
-  for(int i = 0 ; i < 100; i++) {}
+  for (int i = 0; i < 100; i++) {
+    Arg = 0;
+  }
   // CHECK: define internal void [[OFFLOADING_FUN_0]](
-  // CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 0, {{.+}}* [[OMP_TEAMS_OUTLINED_0:@.+]] to {{.+}})
+  // CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}}* [[OMP_TEAMS_OUTLINED_0:@.+]] to {{.+}})
   // CHECK: define{{.+}} void [[OMP_TEAMS_OUTLINED_0]](
   // CHECK: call void @__kmpc_for_static_init_4(
-  // CHECK:  call void {{.+}} @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{.+}} 2, {{.+}}* [[OMP_OUTLINED_0:@.+]] to void
+  // OMP50: load i32,{{.*}}!nontemporal
+  // CHECK: call void {{.+}} @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{.+}} 3, {{.+}}* [[OMP_OUTLINED_0:@.+]] to void
   // CHECK: call void @__kmpc_for_static_fini(
 
   // CHECK: define{{.+}} void [[OMP_OUTLINED_0]](
   // CHECK: call void @__kmpc_for_static_init_4(
+  // OMP45-NOT: !nontemporal
+  // OMP50: store i32 0,{{.*}}!nontemporal
   // CHECK: call void @__kmpc_for_static_fini(
   // CHECK: ret
 #pragma omp target teams distribute parallel for simd if (parallel: false)
Index: clang/test/OpenMP/target_simd_codegen.cpp
===================================================================
--- clang/test/OpenMP/target_simd_codegen.cpp
+++ clang/test/OpenMP/target_simd_codegen.cpp
@@ -85,8 +85,8 @@
 // CHECK-DAG: [[MAPT5:@.+]] = private unnamed_addr constant [3 x i64] [i64 800, i64 800, i64 547]
 // CHECK-DAG: [[SIZET6:@.+]] = private unnamed_addr constant [4 x i64] [i64 4, i64 2, i64 1, i64 40]
 // CHECK-DAG: [[MAPT6:@.+]] = private unnamed_addr constant [4 x i64] [i64 800, i64 800, i64 800, i64 547]
-// OMP45-DAG: [[MAPT7:@.+]] = private unnamed_addr constant [6 x i64] [i64 32, i64 281474976711171, i64 800, i64 800, i64 800, i64 547]
-// OMP50-DAG: [[MAPT7:@.+]] = private unnamed_addr constant [7 x i64] [i64 32, i64 281474976711171, i64 800, i64 800, i64 800, i64 547, i64 800]
+// OMP45-DAG: [[MAPT7:@.+]] = private unnamed_addr constant [5 x i64] [i64 547, i64 800, i64 800, i64 800, i64 547]
+// OMP50-DAG: [[MAPT7:@.+]] = private unnamed_addr constant [6 x i64] [i64 547, i64 800, i64 800, i64 800, i64 547, i64 800]
 // CHECK-DAG: @{{.*}} = weak constant i8 0
 // CHECK-DAG: @{{.*}} = weak constant i8 0
 // CHECK-DAG: @{{.*}} = weak constant i8 0
@@ -461,7 +461,11 @@
     int b = n+1;
     short int c[2][n];
 
-    #pragma omp target simd if(n>60)
+#ifdef OMP5
+    #pragma omp target simd if(n>60) nontemporal(a) private(a)
+#else
+    #pragma omp target simd if(n>60) private(a)
+#endif // OMP5
     for (unsigned long long it = 2000; it >= 600; it -= 400) {
       this->a = (double)b + 1.5;
       c[1][1] = ++a;
@@ -519,96 +523,84 @@
 // CHECK-32:    [[CSZSIZE:%.+]] = mul nuw i32 [[CELEMSIZE2]], 2
 // CHECK-32:    [[CSIZE:%.+]] = sext i32 [[CSZSIZE]] to i64
 
-// OMP45-DAG:   [[RET:%.+]] = call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 6, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i64* [[SR:%[^,]+]], i64* getelementptr inbounds ([6 x i64], [6 x i64]* [[MAPT7]], i32 0, i32 0), i32 1, i32 1)
-// OMP50-DAG:   [[RET:%.+]] = call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 7, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i64* [[SR:%[^,]+]], i64* getelementptr inbounds ([7 x  i64], [7 x  i64]* [[MAPT7]], i32 0, i32 0), i32 1, i32 1)
-// OMP45-DAG:   [[BPR]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP:%.+]], i32 0, i32 0
-// OMP45-DAG:   [[PR]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P:%.+]], i32 0, i32 0
-// OMP45-DAG:   [[SR]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S:%.+]], i32 0, i32 0
-// OMP45-DAG:   [[SADDR0:%.+]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S]], i32 [[IDX0:[0-9]+]]
-// OMP45-DAG:   [[BPADDR0:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP]], i32 [[IDX0]]
-// OMP45-DAG:   [[PADDR0:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P]], i32 [[IDX0]]
-// OMP45-DAG:   [[SADDR1:%.+]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S]], i32 [[IDX1:[0-9]+]]
-// OMP45-DAG:   [[BPADDR1:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP]], i32 [[IDX1]]
-// OMP45-DAG:   [[PADDR1:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P]], i32 [[IDX1]]
-// OMP45-DAG:   [[SADDR2:%.+]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S]], i32 [[IDX2:[0-9]+]]
-// OMP45-DAG:   [[BPADDR2:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP]], i32 [[IDX2]]
-// OMP45-DAG:   [[PADDR2:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P]], i32 [[IDX2]]
-// OMP45-DAG:   [[SADDR3:%.+]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S]], i32 [[IDX3:[0-9]+]]
-// OMP45-DAG:   [[BPADDR3:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP]], i32 [[IDX3]]
-// OMP45-DAG:   [[PADDR3:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P]], i32 [[IDX3]]
-// OMP45-DAG:   [[SADDR4:%.+]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S]], i32 [[IDX4:[0-9]+]]
-// OMP45-DAG:   [[BPADDR4:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP]], i32 [[IDX4]]
-// OMP45-DAG:   [[PADDR4:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P]], i32 [[IDX4]]
-// OMP45-DAG:   [[SADDR5:%.+]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S]], i32 [[IDX5:[0-9]+]]
-// OMP45-DAG:   [[BPADDR5:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP]], i32 [[IDX5]]
-// OMP45-DAG:   [[PADDR5:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P]], i32 [[IDX5]]
-// OMP50-DAG:   [[BPR]] = getelementptr inbounds [7 x  i8*], [7 x  i8*]* [[BP:%.+]], i32 0, i32 0
-// OMP50-DAG:   [[PR]] = getelementptr inbounds [7 x  i8*], [7 x  i8*]* [[P:%.+]], i32 0, i32 0
-// OMP50-DAG:   [[SR]] = getelementptr inbounds [7 x  i64], [7 x  i64]* [[S:%.+]], i32 0, i32 0
-// OMP50-DAG:   [[SADDR0:%.+]] = getelementptr inbounds [7 x  i64], [7 x  i64]* [[S]], i32 [[IDX0:[0-9]+]]
-// OMP50-DAG:   [[BPADDR0:%.+]] = getelementptr inbounds [7 x  i8*], [7 x  i8*]* [[BP]], i32 [[IDX0]]
-// OMP50-DAG:   [[PADDR0:%.+]] = getelementptr inbounds [7 x  i8*], [7 x  i8*]* [[P]], i32 [[IDX0]]
-// OMP50-DAG:   [[SADDR1:%.+]] = getelementptr inbounds [7 x  i64], [7 x  i64]* [[S]], i32 [[IDX1:[0-9]+]]
-// OMP50-DAG:   [[BPADDR1:%.+]] = getelementptr inbounds [7 x  i8*], [7 x  i8*]* [[BP]], i32 [[IDX1]]
-// OMP50-DAG:   [[PADDR1:%.+]] = getelementptr inbounds [7 x  i8*], [7 x  i8*]* [[P]], i32 [[IDX1]]
-// OMP50-DAG:   [[SADDR2:%.+]] = getelementptr inbounds [7 x  i64], [7 x  i64]* [[S]], i32 [[IDX2:[0-9]+]]
-// OMP50-DAG:   [[BPADDR2:%.+]] = getelementptr inbounds [7 x  i8*], [7 x  i8*]* [[BP]], i32 [[IDX2]]
-// OMP50-DAG:   [[PADDR2:%.+]] = getelementptr inbounds [7 x  i8*], [7 x  i8*]* [[P]], i32 [[IDX2]]
-// OMP50-DAG:   [[SADDR3:%.+]] = getelementptr inbounds [7 x  i64], [7 x  i64]* [[S]], i32 [[IDX3:[0-9]+]]
-// OMP50-DAG:   [[BPADDR3:%.+]] = getelementptr inbounds [7 x  i8*], [7 x  i8*]* [[BP]], i32 [[IDX3]]
-// OMP50-DAG:   [[PADDR3:%.+]] = getelementptr inbounds [7 x  i8*], [7 x  i8*]* [[P]], i32 [[IDX3]]
-// OMP50-DAG:   [[SADDR4:%.+]] = getelementptr inbounds [7 x  i64], [7 x  i64]* [[S]], i32 [[IDX4:[0-9]+]]
-// OMP50-DAG:   [[BPADDR4:%.+]] = getelementptr inbounds [7 x  i8*], [7 x  i8*]* [[BP]], i32 [[IDX4]]
-// OMP50-DAG:   [[PADDR4:%.+]] = getelementptr inbounds [7 x  i8*], [7 x  i8*]* [[P]], i32 [[IDX4]]
-// OMP50-DAG:   [[SADDR5:%.+]] = getelementptr inbounds [7 x  i64], [7 x  i64]* [[S]], i32 [[IDX5:[0-9]+]]
-// OMP50-DAG:   [[BPADDR5:%.+]] = getelementptr inbounds [7 x  i8*], [7 x  i8*]* [[BP]], i32 [[IDX5]]
-// OMP50-DAG:   [[PADDR5:%.+]] = getelementptr inbounds [7 x  i8*], [7 x  i8*]* [[P]], i32 [[IDX5]]
-// OMP50-DAG:   [[SADDR6:%.+]] = getelementptr inbounds [7 x  i64], [7 x  i64]* [[S]], i32 [[IDX6:[0-9]+]]
-// OMP50-DAG:   [[BPADDR6:%.+]] = getelementptr inbounds [7 x  i8*], [7 x  i8*]* [[BP]], i32 [[IDX6]]
-// OMP50-DAG:   [[PADDR6:%.+]] = getelementptr inbounds [7 x  i8*], [7 x  i8*]* [[P]], i32 [[IDX6]]
+// OMP45-DAG:   [[RET:%.+]] = call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 5, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i64* [[SR:%[^,]+]], i64* getelementptr inbounds ([5 x i64], [5 x i64]* [[MAPT7]], i32 0, i32 0), i32 1, i32 1)
+// OMP50-DAG:   [[RET:%.+]] = call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 6, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i64* [[SR:%[^,]+]], i64* getelementptr inbounds ([6 x  i64], [6 x  i64]* [[MAPT7]], i32 0, i32 0), i32 1, i32 1)
+// OMP45-DAG:   [[BPR]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP:%.+]], i32 0, i32 0
+// OMP45-DAG:   [[PR]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P:%.+]], i32 0, i32 0
+// OMP45-DAG:   [[SR]] = getelementptr inbounds [5 x i64], [5 x i64]* [[S:%.+]], i32 0, i32 0
+// OMP45-DAG:   [[SADDR0:%.+]] = getelementptr inbounds [5 x i64], [5 x i64]* [[S]], i32 [[IDX0:[0-9]+]]
+// OMP45-DAG:   [[BPADDR0:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP]], i32 [[IDX0]]
+// OMP45-DAG:   [[PADDR0:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P]], i32 [[IDX0]]
+// OMP45-DAG:   [[SADDR1:%.+]] = getelementptr inbounds [5 x i64], [5 x i64]* [[S]], i32 [[IDX1:[0-9]+]]
+// OMP45-DAG:   [[BPADDR1:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP]], i32 [[IDX1]]
+// OMP45-DAG:   [[PADDR1:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P]], i32 [[IDX1]]
+// OMP45-DAG:   [[SADDR2:%.+]] = getelementptr inbounds [5 x i64], [5 x i64]* [[S]], i32 [[IDX2:[0-9]+]]
+// OMP45-DAG:   [[BPADDR2:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP]], i32 [[IDX2]]
+// OMP45-DAG:   [[PADDR2:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P]], i32 [[IDX2]]
+// OMP45-DAG:   [[SADDR3:%.+]] = getelementptr inbounds [5 x i64], [5 x i64]* [[S]], i32 [[IDX3:[0-9]+]]
+// OMP45-DAG:   [[BPADDR3:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP]], i32 [[IDX3]]
+// OMP45-DAG:   [[PADDR3:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P]], i32 [[IDX3]]
+// OMP45-DAG:   [[SADDR4:%.+]] = getelementptr inbounds [5 x i64], [5 x i64]* [[S]], i32 [[IDX4:[0-9]+]]
+// OMP45-DAG:   [[BPADDR4:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP]], i32 [[IDX4]]
+// OMP45-DAG:   [[PADDR4:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P]], i32 [[IDX4]]
+// OMP50-DAG:   [[BPR]] = getelementptr inbounds [6 x  i8*], [6 x  i8*]* [[BP:%.+]], i32 0, i32 0
+// OMP50-DAG:   [[PR]] = getelementptr inbounds [6 x  i8*], [6 x  i8*]* [[P:%.+]], i32 0, i32 0
+// OMP50-DAG:   [[SR]] = getelementptr inbounds [6 x  i64], [6 x  i64]* [[S:%.+]], i32 0, i32 0
+// OMP50-DAG:   [[SADDR0:%.+]] = getelementptr inbounds [6 x  i64], [6 x  i64]* [[S]], i32 [[IDX0:[0-9]+]]
+// OMP50-DAG:   [[BPADDR0:%.+]] = getelementptr inbounds [6 x  i8*], [6 x  i8*]* [[BP]], i32 [[IDX0]]
+// OMP50-DAG:   [[PADDR0:%.+]] = getelementptr inbounds [6 x  i8*], [6 x  i8*]* [[P]], i32 [[IDX0]]
+// OMP50-DAG:   [[SADDR1:%.+]] = getelementptr inbounds [6 x  i64], [6 x  i64]* [[S]], i32 [[IDX1:[0-9]+]]
+// OMP50-DAG:   [[BPADDR1:%.+]] = getelementptr inbounds [6 x  i8*], [6 x  i8*]* [[BP]], i32 [[IDX1]]
+// OMP50-DAG:   [[PADDR1:%.+]] = getelementptr inbounds [6 x  i8*], [6 x  i8*]* [[P]], i32 [[IDX1]]
+// OMP50-DAG:   [[SADDR2:%.+]] = getelementptr inbounds [6 x  i64], [6 x  i64]* [[S]], i32 [[IDX2:[0-9]+]]
+// OMP50-DAG:   [[BPADDR2:%.+]] = getelementptr inbounds [6 x  i8*], [6 x  i8*]* [[BP]], i32 [[IDX2]]
+// OMP50-DAG:   [[PADDR2:%.+]] = getelementptr inbounds [6 x  i8*], [6 x  i8*]* [[P]], i32 [[IDX2]]
+// OMP50-DAG:   [[SADDR3:%.+]] = getelementptr inbounds [6 x  i64], [6 x  i64]* [[S]], i32 [[IDX3:[0-9]+]]
+// OMP50-DAG:   [[BPADDR3:%.+]] = getelementptr inbounds [6 x  i8*], [6 x  i8*]* [[BP]], i32 [[IDX3]]
+// OMP50-DAG:   [[PADDR3:%.+]] = getelementptr inbounds [6 x  i8*], [6 x  i8*]* [[P]], i32 [[IDX3]]
+// OMP50-DAG:   [[SADDR4:%.+]] = getelementptr inbounds [6 x  i64], [6 x  i64]* [[S]], i32 [[IDX4:[0-9]+]]
+// OMP50-DAG:   [[BPADDR4:%.+]] = getelementptr inbounds [6 x  i8*], [6 x  i8*]* [[BP]], i32 [[IDX4]]
+// OMP50-DAG:   [[PADDR4:%.+]] = getelementptr inbounds [6 x  i8*], [6 x  i8*]* [[P]], i32 [[IDX4]]
+// OMP50-DAG:   [[SADDR5:%.+]] = getelementptr inbounds [6 x  i64], [6 x  i64]* [[S]], i32 [[IDX5:[0-9]+]]
+// OMP50-DAG:   [[BPADDR5:%.+]] = getelementptr inbounds [6 x  i8*], [6 x  i8*]* [[BP]], i32 [[IDX5]]
+// OMP50-DAG:   [[PADDR5:%.+]] = getelementptr inbounds [6 x  i8*], [6 x  i8*]* [[P]], i32 [[IDX5]]
 
 // The names below are not necessarily consistent with the names used for the
 // addresses above as some are repeated.
 // CHECK-DAG:   store [[S1]]* %{{.+}}, [[S1]]** [[CBPADDR0:%.+]],
-// CHECK-DAG:   store double* %{{.+}}, double** [[CPADDR0:%.+]],
+// CHECK-DAG:   store [[S1]]* %{{.+}}, [[S1]]** [[CPADDR0:%.+]],
 // CHECK-DAG:   [[CBPADDR0]] = bitcast i8** {{%[^,]+}} to [[S1]]**
-// CHECK-DAG:   [[CPADDR0]] = bitcast i8** {{%[^,]+}} to double**
-// CHECK-DAG:   store i64 %{{.+}}, i64* {{%[^,]+}}
-
-// CHECK-DAG:   store [[S1]]* %{{.+}}, [[S1]]** [[CBPADDR1:%.+]],
-// CHECK-DAG:   store double* %{{.+}}, double** [[CPADDR1:%.+]],
-// CHECK-DAG:   [[CBPADDR1]] = bitcast i8** {{%[^,]+}} to [[S1]]**
-// CHECK-DAG:   [[CPADDR1]] = bitcast i8** {{%[^,]+}} to double**
+// CHECK-DAG:   [[CPADDR0]] = bitcast i8** {{%[^,]+}} to [[S1]]**
 // CHECK-DAG:   store i64 {{4|8}}, i64* {{%[^,]+}}
 
-// CHECK-DAG:   store i[[SZ]] [[B_CVAL]], i[[SZ]]* [[CBPADDR2:%.+]],
-// CHECK-DAG:   store i[[SZ]] [[B_CVAL]], i[[SZ]]* [[CPADDR2:%.+]],
+// CHECK-DAG:   store i[[SZ]] [[B_CVAL]], i[[SZ]]* [[CBPADDR1:%.+]],
+// CHECK-DAG:   store i[[SZ]] [[B_CVAL]], i[[SZ]]* [[CPADDR1:%.+]],
+// CHECK-DAG:   [[CBPADDR1]] = bitcast i8** {{%[^,]+}} to i[[SZ]]*
+// CHECK-DAG:   [[CPADDR1]] = bitcast i8** {{%[^,]+}} to i[[SZ]]*
+// CHECK-DAG:   store i64 4, i64* {{%[^,]+}}
+
+// CHECK-DAG:   store i[[SZ]] 2, i[[SZ]]* [[CBPADDR2:%.+]],
+// CHECK-DAG:   store i[[SZ]] 2, i[[SZ]]* [[CPADDR2:%.+]],
 // CHECK-DAG:   [[CBPADDR2]] = bitcast i8** {{%[^,]+}} to i[[SZ]]*
 // CHECK-DAG:   [[CPADDR2]] = bitcast i8** {{%[^,]+}} to i[[SZ]]*
-// CHECK-DAG:   store i64 4, i64* {{%[^,]+}}
+// CHECK-DAG:   store i64 {{4|8}}, i64* {{%[^,]+}}
 
-// CHECK-DAG:   store i[[SZ]] 2, i[[SZ]]* [[CBPADDR3:%.+]],
-// CHECK-DAG:   store i[[SZ]] 2, i[[SZ]]* [[CPADDR3:%.+]],
+// CHECK-DAG:   store i[[SZ]] %{{.+}}, i[[SZ]]* [[CBPADDR3:%.+]],
+// CHECK-DAG:   store i[[SZ]] %{{.+}}, i[[SZ]]* [[CPADDR3:%.+]],
 // CHECK-DAG:   [[CBPADDR3]] = bitcast i8** {{%[^,]+}} to i[[SZ]]*
 // CHECK-DAG:   [[CPADDR3]] = bitcast i8** {{%[^,]+}} to i[[SZ]]*
 // CHECK-DAG:   store i64 {{4|8}}, i64* {{%[^,]+}}
 
-// CHECK-DAG:   store i[[SZ]] %{{.+}}, i[[SZ]]* [[CBPADDR4:%.+]],
-// CHECK-DAG:   store i[[SZ]] %{{.+}}, i[[SZ]]* [[CPADDR4:%.+]],
-// CHECK-DAG:   [[CBPADDR4]] = bitcast i8** {{%[^,]+}} to i[[SZ]]*
-// CHECK-DAG:   [[CPADDR4]] = bitcast i8** {{%[^,]+}} to i[[SZ]]*
-// CHECK-DAG:   store i64 {{4|8}}, i64* {{%[^,]+}}
-
-// CHECK-DAG:   store i16* %{{.+}}, i16** [[CBPADDR5:%.+]],
-// CHECK-DAG:   store i16* %{{.+}}, i16** [[CPADDR5:%.+]],
-// CHECK-DAG:   [[CBPADDR5]] = bitcast i8** {{%[^,]+}} to i16**
-// CHECK-DAG:   [[CPADDR5]] = bitcast i8** {{%[^,]+}} to i16**
+// CHECK-DAG:   store i16* %{{.+}}, i16** [[CBPADDR4:%.+]],
+// CHECK-DAG:   store i16* %{{.+}}, i16** [[CPADDR4:%.+]],
+// CHECK-DAG:   [[CBPADDR4]] = bitcast i8** {{%[^,]+}} to i16**
+// CHECK-DAG:   [[CPADDR4]] = bitcast i8** {{%[^,]+}} to i16**
 // CHECK-DAG:   store i64 [[CSIZE]], i64* {{%[^,]+}}
 
-// OMP50-DAG:   store i[[SZ]] [[SIMD_COND]], i[[SZ]]* [[CBPADDR6:%.+]],
-// OMP50-DAG:   store i[[SZ]] [[SIMD_COND]], i[[SZ]]* [[CPADDR6:%.+]],
-// OMP50-DAG:   [[CBPADDR6]] = bitcast i8** {{%[^,]+}} to i[[SZ]]*
-// OMP50-DAG:   [[CPADDR6]] = bitcast i8** {{%[^,]+}} to i[[SZ]]*
+// OMP50-DAG:   store i[[SZ]] [[SIMD_COND]], i[[SZ]]* [[CBPADDR5:%.+]],
+// OMP50-DAG:   store i[[SZ]] [[SIMD_COND]], i[[SZ]]* [[CPADDR5:%.+]],
+// OMP50-DAG:   [[CBPADDR5]] = bitcast i8** {{%[^,]+}} to i[[SZ]]*
+// OMP50-DAG:   [[CPADDR5]] = bitcast i8** {{%[^,]+}} to i[[SZ]]*
 // OMP50-DAG:   store i64 1, i64* {{%[^,]+}}
 
 // CHECK-NEXT:  [[ERROR:%.+]] = icmp ne i32 [[RET]], 0
@@ -738,6 +730,10 @@
 // OMP50-DAG:   [[CONV_COND:%.+]] = bitcast i[[SZ]]* [[LOCAL_SIMD_COND_CASTED]] to i8*
 // OMP50-DAG:   [[SIMD_COND:%.+]] = load i8, i8* [[CONV_COND]],
 // OMP50-DAG:   trunc i8 [[SIMD_COND]] to i1
+// OMP45-NOT:   !nontemporal
+// OMP50:       store double {{.*}}!nontemporal
+// OMP50:       load double, {{.*}}!nontemporal
+// OMP50:       store double {{.*}}!nontemporal
 
 // CHECK:       define internal void [[HVT6]]
 // Create local storage for each capture.
Index: clang/test/OpenMP/target_parallel_for_simd_codegen.cpp
===================================================================
--- clang/test/OpenMP/target_parallel_for_simd_codegen.cpp
+++ clang/test/OpenMP/target_parallel_for_simd_codegen.cpp
@@ -539,7 +539,7 @@
     short int c[2][n];
 
 #ifdef OMP5
-    #pragma omp target parallel for simd if(n>60)
+    #pragma omp target parallel for simd if(n>60) nontemporal(a)
 #else
     #pragma omp target parallel for simd if(target: n>60)
 #endif // OMP5
@@ -837,6 +837,9 @@
 // OMP45:       define internal {{.*}}void [[OMP_OUTLINED5]](i32* noalias %.global_tid., i32* noalias %.bound_tid., [[S1]]* %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i16* {{.+}})
 // OMP50:       define internal {{.*}}void [[OMP_OUTLINED5]](i32* noalias %.global_tid., i32* noalias %.bound_tid., [[S1]]* %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i16* {{.+}}, i[[SZ]] %{{.+}})
 // To reduce complexity, we're only going as far as validating the signature of the outlined parallel function.
+// OMP45-NOT:   !nontemporal
+// OMP50:       store double{{.*}}!nontemporal
+// OMP50:       load double{{.*}}!nontemporal
 
 
 // CHECK:       define internal void [[HVT6]]
Index: clang/test/OpenMP/simd_codegen.cpp
===================================================================
--- clang/test/OpenMP/simd_codegen.cpp
+++ clang/test/OpenMP/simd_codegen.cpp
@@ -22,10 +22,15 @@
 long long get_val() { return 0; }
 double *g_ptr;
 
+struct S {
+  int a, b;
+};
+
 // CHECK-LABEL: define {{.*void}} @{{.*}}simple{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
 void simple(float *a, float *b, float *c, float *d) {
+  S s;
 #ifdef OMP5
-  #pragma omp simd if (simd: true)
+  #pragma omp simd if (simd: true) nontemporal(a, b, c, d, s)
 #else
   #pragma omp simd
 #endif
@@ -43,8 +48,16 @@
 // CHECK-NEXT: store i32 [[CALC_I_2]], i32* [[LC_I:.+]]{{.*}}!llvm.access.group
 // ... loop body ...
 // End of body: store into a[i]:
+// OMP45-NOT: load float*,{{.*}}!nontemporal
+// CHECK-NOT: load float,{{.*}}!nontemporal
+// OMP50: load float*,{{.*}}!nontemporal
+// OMP50: load float*,{{.*}}!nontemporal
+// OMP50: load float*,{{.*}}!nontemporal
+// OMP50: load i32,{{.*}}!nontemporal
+// OMP50: load float*,{{.*}}!nontemporal
+// CHECK-NOT: load float,{{.*}}!nontemporal
 // CHECK: store float [[RESULT:%.+]], float* {{%.+}}{{.*}}!llvm.access.group
-    a[i] = b[i] * c[i] * d[i];
+    a[i] = b[i] * c[i] * d[i] + s.a;
 // CHECK: [[IV1_2:%.+]] = load i32, i32* [[OMP_IV]]{{.*}}!llvm.access.group
 // CHECK-NEXT: [[ADD1_2:%.+]] = add nsw i32 [[IV1_2]], 1
 // CHECK-NEXT: store i32 [[ADD1_2]], i32* [[OMP_IV]]{{.*}}!llvm.access.group
Index: clang/test/OpenMP/for_simd_codegen.cpp
===================================================================
--- clang/test/OpenMP/for_simd_codegen.cpp
+++ clang/test/OpenMP/for_simd_codegen.cpp
@@ -333,7 +333,7 @@
 // OMP50: br i1 [[COND]], label {{%?}}[[THEN:[^,]+]], label {{%?}}[[ELSE:[^,]+]]
 // OMP50: [[THEN]]:
 #ifdef OMP5
-  #pragma omp for simd reduction(*:R) if (simd:A)
+  #pragma omp for simd reduction(*:R) if (simd:A) nontemporal(R)
 #else
   #pragma omp for simd reduction(*:R)
 #endif
@@ -366,7 +366,8 @@
 // CHECK-NEXT: [[LC_IT_2:%.+]] = add nsw i64 -10, [[LC_IT_1]]
 // CHECK-NEXT: store i64 [[LC_IT_2]], i64* [[LC:%[^,]+]],
 // CHECK-NEXT: [[LC_VAL:%.+]] = load i64, i64* [[LC]]
-// CHECK: store i32 %{{.+}}, i32* [[R_PRIV]],
+// OMP45: store i32 %{{.+}}, i32* [[R_PRIV]],
+// OMP50: store i32 %{{.+}}, i32* [[R_PRIV]],{{.*}}!nontemporal
     R *= i;
 // CHECK: [[IV8_2:%.+]] = load i64, i64* [[OMP_IV8]]
 // CHECK-NEXT: [[ADD8_2:%.+]] = add nsw i64 [[IV8_2]], 1
Index: clang/test/OpenMP/distribute_simd_codegen.cpp
===================================================================
--- clang/test/OpenMP/distribute_simd_codegen.cpp
+++ clang/test/OpenMP/distribute_simd_codegen.cpp
@@ -143,7 +143,7 @@
   #pragma omp target
   #pragma omp teams
 #ifdef OMP5
-  #pragma omp distribute simd dist_schedule(static) safelen(32) if(simd: true)
+  #pragma omp distribute simd dist_schedule(static) safelen(32) if(simd: true) nontemporal(a, b)
 #else
   #pragma omp distribute simd dist_schedule(static) safelen(32)
 #endif // OMP5
@@ -189,6 +189,11 @@
 // CHECK:  [[BBINNBODY]]:
 // CHECK:  {{.+}} = load i32, i32* [[IV]]
 // ... loop body ...
+// OMP45-NOT: !nontemporal
+// OMP50:  load float*,{{.*}}!nontemporal
+// OMP50:  load float*,{{.*}}!nontemporal
+// OMP50-NOT: !nontemporal
+
 // CHECK:  br label %[[BBBODYCONT:.+]]
 // CHECK:  [[BBBODYCONT]]:
 // CHECK:  br label %[[BBINNINC:.+]]
@@ -271,7 +276,7 @@
   #pragma omp target
   #pragma omp teams
 #ifdef OMP5
-  #pragma omp distribute simd linear(i) if(a)
+  #pragma omp distribute simd linear(i) if(a) nontemporal(i)
 #else
   #pragma omp distribute simd linear(i)
 #endif // OMP5
@@ -293,6 +298,9 @@
 // CHECK:  br i1 [[ACMP]], label %[[PRECOND_THEN:.+]], label %[[PRECOND_END:.+]]
 // CHECK:  [[PRECOND_THEN]]
 // CHECK:  call void @__kmpc_for_static_init_4
+// OMP45-NOT: !nontemporal
+// OMP50:  store i8 {{.*}}!nontemporal
+// OMP50-NOT: !nontemporal
 // CHECK:  call void @__kmpc_for_static_fini
 // CHECK:  [[PRECOND_END]]
 
Index: clang/lib/Serialization/ASTWriter.cpp
===================================================================
--- clang/lib/Serialization/ASTWriter.cpp
+++ clang/lib/Serialization/ASTWriter.cpp
@@ -6544,4 +6544,6 @@
   Record.AddSourceLocation(C->getLParenLoc());
   for (auto *VE : C->varlists())
     Record.AddStmt(VE);
+  for (auto *E : C->private_refs())
+    Record.AddStmt(E);
 }
Index: clang/lib/Serialization/ASTReader.cpp
===================================================================
--- clang/lib/Serialization/ASTReader.cpp
+++ clang/lib/Serialization/ASTReader.cpp
@@ -12466,4 +12466,9 @@
   for (unsigned i = 0; i != NumVars; ++i)
     Vars.push_back(Record.readSubExpr());
   C->setVarRefs(Vars);
+  Vars.clear();
+  Vars.reserve(NumVars);
+  for (unsigned i = 0; i != NumVars; ++i)
+    Vars.push_back(Record.readSubExpr());
+  C->setPrivateRefs(Vars);
 }
Index: clang/lib/Sema/SemaOpenMP.cpp
===================================================================
--- clang/lib/Sema/SemaOpenMP.cpp
+++ clang/lib/Sema/SemaOpenMP.cpp
@@ -2230,6 +2230,11 @@
 
 static void checkAllocateClauses(Sema &S, DSAStackTy *Stack,
                                  ArrayRef<OMPClause *> Clauses);
+static std::pair<ValueDecl *, bool>
+getPrivateItem(Sema &S, Expr *&RefExpr, SourceLocation &ELoc,
+               SourceRange &ERange, bool AllowArraySection = false);
+static DeclRefExpr *buildCapture(Sema &S, ValueDecl *D, Expr *CaptureExpr,
+                                 bool WithInit);
 
 void Sema::EndOpenMPDSABlock(Stmt *CurDirective) {
   // OpenMP [2.14.3.5, Restrictions, C/C++, p.1]
@@ -2274,6 +2279,31 @@
           }
         }
         Clause->setPrivateCopies(PrivateCopies);
+        continue;
+      }
+      // Finalize nontemporal clause by handling private copies, if any.
+      if (auto *Clause = dyn_cast<OMPNontemporalClause>(C)) {
+        SmallVector<Expr *, 8> PrivateRefs;
+        for (Expr *RefExpr : Clause->varlists()) {
+          assert(RefExpr && "NULL expr in OpenMP nontemporal clause.");
+          SourceLocation ELoc;
+          SourceRange ERange;
+          Expr *SimpleRefExpr = RefExpr;
+          auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange);
+          if (Res.second)
+            // It will be analyzed later.
+            PrivateRefs.push_back(nullptr);
+          ValueDecl *D = Res.first;
+          if (!D)
+            continue;
+
+          const DSAStackTy::DSAVarData DVar =
+              DSAStack->getTopDSA(D, /*FromParent=*/false);
+          PrivateRefs.push_back(DVar.PrivateCopy ? DVar.PrivateCopy
+                                                 : SimpleRefExpr);
+        }
+        Clause->setPrivateRefs(PrivateRefs);
+        continue;
       }
     }
     // Check allocate clauses.
@@ -4262,9 +4292,10 @@
   return ErrorFound;
 }
 
-static std::pair<ValueDecl *, bool>
-getPrivateItem(Sema &S, Expr *&RefExpr, SourceLocation &ELoc,
-               SourceRange &ERange, bool AllowArraySection = false) {
+static std::pair<ValueDecl *, bool> getPrivateItem(Sema &S, Expr *&RefExpr,
+                                                   SourceLocation &ELoc,
+                                                   SourceRange &ERange,
+                                                   bool AllowArraySection) {
   if (RefExpr->isTypeDependent() || RefExpr->isValueDependent() ||
       RefExpr->containsUnexpandedParameterPack())
     return std::make_pair(nullptr, true);
@@ -17172,8 +17203,6 @@
     if (!D)
       continue;
 
-    auto *VD = dyn_cast<VarDecl>(D);
-
     // OpenMP 5.0, 2.9.3.1 simd Construct, Restrictions.
     // A list-item cannot appear in more than one nontemporal clause.
     if (const Expr *PrevRef =
@@ -17185,12 +17214,7 @@
       continue;
     }
 
-    DeclRefExpr *Ref = nullptr;
-    if (!VD && isOpenMPCapturedDecl(D) && !CurContext->isDependentContext())
-      Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/true);
-    Vars.push_back((VD || !Ref || CurContext->isDependentContext())
-                       ? RefExpr->IgnoreParens()
-                       : Ref);
+    Vars.push_back(RefExpr);
   }
 
   if (Vars.empty())
Index: clang/lib/CodeGen/CGStmtOpenMP.cpp
===================================================================
--- clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -1803,8 +1803,9 @@
 static void emitCommonSimdLoop(CodeGenFunction &CGF, const OMPLoopDirective &S,
                                const RegionCodeGenTy &SimdInitGen,
                                const RegionCodeGenTy &BodyCodeGen) {
-  auto &&ThenGen = [&SimdInitGen, &BodyCodeGen](CodeGenFunction &CGF,
-                                                PrePostActionTy &) {
+  auto &&ThenGen = [&S, &SimdInitGen, &BodyCodeGen](CodeGenFunction &CGF,
+                                                    PrePostActionTy &) {
+    CGOpenMPRuntime::NontemporalDeclsRAII NontemporalsRegion(CGF.CGM, S);
     CodeGenFunction::OMPLocalDeclMapRAII Scope(CGF);
     SimdInitGen(CGF);
 
Index: clang/lib/CodeGen/CGOpenMPRuntime.h
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntime.h
+++ clang/lib/CodeGen/CGOpenMPRuntime.h
@@ -211,6 +211,15 @@
     ~DisableAutoDeclareTargetRAII();
   };
 
+  /// Manages list of nontemporal decls for the specified directive.
+  class NontemporalDeclsRAII {
+    CodeGenModule &CGM;
+
+  public:
+    NontemporalDeclsRAII(CodeGenModule &CGM, const OMPLoopDirective &S);
+    ~NontemporalDeclsRAII();
+  };
+
 protected:
   CodeGenModule &CGM;
   StringRef FirstSeparator, Separator;
@@ -650,6 +659,10 @@
                   std::pair<GlobalDecl, GlobalDecl>>
       DeferredVariantFunction;
 
+  using NontemporalDeclsSet = llvm::SmallDenseSet<CanonicalDeclPtr<const Decl>>;
+  /// Stack for list of declaration in current context marked as nontemporal.
+  llvm::SmallVector<NontemporalDeclsSet, 4> NontemporalDeclsStack;
+
   /// Flag for keeping track of weather a requires unified_shared_memory
   /// directive is present.
   bool HasRequiresUnifiedSharedMemory = false;
@@ -1663,6 +1676,10 @@
 
   /// Emits the definition of the declare variant function.
   virtual bool emitDeclareVariant(GlobalDecl GD, bool IsForDefinition);
+
+  /// Checks if the \p VD variable is marked as nontemporal declaration in
+  /// current context.
+  bool isNontemporalDecl(const ValueDecl *VD) const;
 };
 
 /// Class supports emissionof SIMD-only code.
Index: clang/lib/CodeGen/CGOpenMPRuntime.cpp
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -11341,6 +11341,43 @@
   return true;
 }
 
+CGOpenMPRuntime::NontemporalDeclsRAII::NontemporalDeclsRAII(
+    CodeGenModule &CGM, const OMPLoopDirective &S)
+    : CGM(CGM) {
+  assert(CGM.getLangOpts().OpenMP && "Not in OpenMP mode.");
+  NontemporalDeclsSet &DS =
+      CGM.getOpenMPRuntime().NontemporalDeclsStack.emplace_back();
+  // No need to check for nontemporal clauses in non-simd directives.
+  if (!isOpenMPSimdDirective(S.getDirectiveKind()))
+      return;
+  for (const auto *C : S.getClausesOfKind<OMPNontemporalClause>()) {
+    for (const Stmt *Ref : C->private_refs()) {
+      const auto *SimpleRefExpr = cast<Expr>(Ref)->IgnoreParenImpCasts();
+      const ValueDecl *VD;
+      if (const auto *DRE = dyn_cast<DeclRefExpr>(SimpleRefExpr)) {
+        VD = DRE->getDecl();
+      } else {
+        const auto *ME = cast<MemberExpr>(SimpleRefExpr);
+        assert((ME->isImplicitCXXThis() ||
+                isa<CXXThisExpr>(ME->getBase()->IgnoreParenImpCasts())) &&
+               "Expected member of current class.");
+        VD = ME->getMemberDecl();
+      }
+      DS.insert(VD);
+    }
+  }
+}
+
+CGOpenMPRuntime::NontemporalDeclsRAII::~NontemporalDeclsRAII() {
+  CGM.getOpenMPRuntime().NontemporalDeclsStack.pop_back();
+}
+
+bool CGOpenMPRuntime::isNontemporalDecl(const ValueDecl *VD) const {
+  assert(CGM.getLangOpts().OpenMP && "Not in OpenMP mode.");
+  return !CGM.getOpenMPRuntime().NontemporalDeclsStack.empty() &&
+         CGM.getOpenMPRuntime().NontemporalDeclsStack.back().count(VD) > 0;
+}
+
 llvm::Function *CGOpenMPSIMDRuntime::emitParallelOutlinedFunction(
     const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
     OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
Index: clang/lib/CodeGen/CGExpr.cpp
===================================================================
--- clang/lib/CodeGen/CGExpr.cpp
+++ clang/lib/CodeGen/CGExpr.cpp
@@ -2566,21 +2566,35 @@
       VD = VD->getCanonicalDecl();
       if (auto *FD = LambdaCaptureFields.lookup(VD))
         return EmitCapturedFieldLValue(*this, FD, CXXABIThisValue);
-      else if (CapturedStmtInfo) {
+      if (CapturedStmtInfo) {
         auto I = LocalDeclMap.find(VD);
         if (I != LocalDeclMap.end()) {
+          LValue CapLVal;
           if (VD->getType()->isReferenceType())
-            return EmitLoadOfReferenceLValue(I->second, VD->getType(),
-                                             AlignmentSource::Decl);
-          return MakeAddrLValue(I->second, T);
+            CapLVal = EmitLoadOfReferenceLValue(I->second, VD->getType(),
+                                                AlignmentSource::Decl);
+          else
+            CapLVal = MakeAddrLValue(I->second, T);
+          // Mark lvalue as nontemporal if the variable is marked as nontemporal
+          // in simd context.
+          if (getLangOpts().OpenMP &&
+              CGM.getOpenMPRuntime().isNontemporalDecl(VD))
+            CapLVal.setNontemporal(/*Value=*/true);
+          return CapLVal;
         }
         LValue CapLVal =
             EmitCapturedFieldLValue(*this, CapturedStmtInfo->lookup(VD),
                                     CapturedStmtInfo->getContextValue());
-        return MakeAddrLValue(
+        CapLVal = MakeAddrLValue(
             Address(CapLVal.getPointer(*this), getContext().getDeclAlign(VD)),
             CapLVal.getType(), LValueBaseInfo(AlignmentSource::Decl),
             CapLVal.getTBAAInfo());
+        // Mark lvalue as nontemporal if the variable is marked as nontemporal
+        // in simd context.
+        if (getLangOpts().OpenMP &&
+            CGM.getOpenMPRuntime().isNontemporalDecl(VD))
+          CapLVal.setNontemporal(/*Value=*/true);
+        return CapLVal;
       }
 
       assert(isa<BlockDecl>(CurCodeDecl));
@@ -3929,6 +3943,15 @@
   if (auto *Field = dyn_cast<FieldDecl>(ND)) {
     LValue LV = EmitLValueForField(BaseLV, Field);
     setObjCGCLValueClass(getContext(), E, LV);
+    if (getLangOpts().OpenMP) {
+      // If the member was explicitly marked as nontemporal, mark it as
+      // nontemporal. If the base lvalue is marked as nontemporal, mark access
+      // to children as nontemporal too.
+      if ((IsWrappedCXXThis(BaseExpr) &&
+           CGM.getOpenMPRuntime().isNontemporalDecl(Field)) ||
+          (!E->isArrow() && BaseLV.isNontemporal()))
+        LV.setNontemporal(/*Value=*/true);
+    }
     return LV;
   }
 
Index: clang/lib/AST/StmtProfile.cpp
===================================================================
--- clang/lib/AST/StmtProfile.cpp
+++ clang/lib/AST/StmtProfile.cpp
@@ -769,8 +769,13 @@
     const OMPIsDevicePtrClause *C) {
   VisitOMPClauseList(C);
 }
-void OMPClauseProfiler::VisitOMPNontemporalClause(const OMPNontemporalClause *C) {
+void OMPClauseProfiler::VisitOMPNontemporalClause(
+    const OMPNontemporalClause *C) {
   VisitOMPClauseList(C);
+  for (auto *E : C->private_refs()) {
+    if (E)
+      Profiler->VisitStmt(E);
+  }
 }
 }
 
Index: clang/lib/AST/OpenMPClause.cpp
===================================================================
--- clang/lib/AST/OpenMPClause.cpp
+++ clang/lib/AST/OpenMPClause.cpp
@@ -1162,8 +1162,8 @@
                                                    SourceLocation LParenLoc,
                                                    SourceLocation EndLoc,
                                                    ArrayRef<Expr *> VL) {
-  // Allocate space for nontemporal variables.
-  void *Mem = C.Allocate(totalSizeToAlloc<Expr *>(VL.size()));
+  // Allocate space for nontemporal variables + private references.
+  void *Mem = C.Allocate(totalSizeToAlloc<Expr *>(2 * VL.size()));
   auto *Clause =
       new (Mem) OMPNontemporalClause(StartLoc, LParenLoc, EndLoc, VL.size());
   Clause->setVarRefs(VL);
@@ -1172,10 +1172,16 @@
 
 OMPNontemporalClause *OMPNontemporalClause::CreateEmpty(const ASTContext &C,
                                                         unsigned N) {
-  void *Mem = C.Allocate(totalSizeToAlloc<Expr *>(N));
+  void *Mem = C.Allocate(totalSizeToAlloc<Expr *>(2 * N));
   return new (Mem) OMPNontemporalClause(N);
 }
 
+void OMPNontemporalClause::setPrivateRefs(ArrayRef<Expr *> VL) {
+  assert(VL.size() == varlist_size() && "Number of private references is not "
+                                        "the same as the preallocated buffer");
+  std::copy(VL.begin(), VL.end(), varlist_end());
+}
+
 //===----------------------------------------------------------------------===//
 //  OpenMP clauses printing methods
 //===----------------------------------------------------------------------===//
Index: clang/include/clang/AST/RecursiveASTVisitor.h
===================================================================
--- clang/include/clang/AST/RecursiveASTVisitor.h
+++ clang/include/clang/AST/RecursiveASTVisitor.h
@@ -3378,6 +3378,9 @@
 bool RecursiveASTVisitor<Derived>::VisitOMPNontemporalClause(
     OMPNontemporalClause *C) {
   TRY_TO(VisitOMPClauseList(C));
+  for (auto *E : C->private_refs()) {
+    TRY_TO(TraverseStmt(E));
+  }
   return true;
 }
 
Index: clang/include/clang/AST/OpenMPClause.h
===================================================================
--- clang/include/clang/AST/OpenMPClause.h
+++ clang/include/clang/AST/OpenMPClause.h
@@ -6275,6 +6275,15 @@
             OMPC_nontemporal, SourceLocation(), SourceLocation(),
             SourceLocation(), N) {}
 
+  /// Get the list of privatied copies if the member expression was captured by
+  /// one of the privatization clauses.
+  MutableArrayRef<Expr *> getPrivateRefs() {
+    return MutableArrayRef<Expr *>(varlist_end(), varlist_size());
+  }
+  ArrayRef<const Expr *> getPrivateRefs() const {
+    return llvm::makeArrayRef(varlist_end(), varlist_size());
+  }
+
 public:
   /// Creates clause with a list of variables \a VL.
   ///
@@ -6293,6 +6302,10 @@
   /// \param N The number of variables.
   static OMPNontemporalClause *CreateEmpty(const ASTContext &C, unsigned N);
 
+  /// Sets the list of references to private copies created in private clauses.
+  /// \param VL List of references.
+  void setPrivateRefs(ArrayRef<Expr *> VL);
+
   child_range children() {
     return child_range(reinterpret_cast<Stmt **>(varlist_begin()),
                        reinterpret_cast<Stmt **>(varlist_end()));
@@ -6303,6 +6316,16 @@
     return const_child_range(Children.begin(), Children.end());
   }
 
+  child_range private_refs() {
+    return child_range(reinterpret_cast<Stmt **>(getPrivateRefs().begin()),
+                       reinterpret_cast<Stmt **>(getPrivateRefs().end()));
+  }
+
+  const_child_range private_refs() const {
+    auto Children = const_cast<OMPNontemporalClause *>(this)->private_refs();
+    return const_child_range(Children.begin(), Children.end());
+  }
+
   child_range used_children() {
     return child_range(child_iterator(), child_iterator());
   }
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to