Meinersbur created this revision.
Meinersbur added reviewers: jdoerfert, AMDChirag, anchu-rajendran, 
kiranchandramohan, SouraVX, ftynse, kiranktp, fghanim, ABataev.
Meinersbur added projects: OpenMP, clang.
Herald added subscribers: dexonsmith, rriddle, arphaman, guansong, hiraditya, 
yaxunl.
Meinersbur requested review of this revision.
Herald added subscribers: llvm-commits, sstefan1, stephenneuendorffer.
Herald added a project: LLVM.

Initial support for using the OpenMPIRBuilder in clang to generate loops. This 
initial support is intentionally limited to:

- Only the worksharing-loop directive.
- Recognizes only the nowait clause.
- No loop nests with more than one loop.
- Untested with templates and exceptions.
- Semantic checking left to the existing infrastructure.

This patch introduces a new AST node, OMPCanonicalLoop, which becomes the 
parent of any loop that has to adhere to the restrictions specified by the 
OpenMP standard. These restrictions allow OMPCanonicalLoop to provide the 
following additional information that depends on base-language semantics:

- The distance function: Computes, before entering the loop nest, how many 
loop iterations there will be.
- The loop variable function: Converts a logical iteration number to the 
corresponding value of the loop variable (a sketch of both follows below).
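
For illustration, here is a rough C rendering of the two closures for the loop 
in the new test irbuilder_for_unsigned.c, for (unsigned i = 33; i < 32000000; 
i += 7). The struct and function names below are made up for readability; the 
actual closures are emitted as __captured_stmt and __captured_stmt.1 in that 
test's CHECK lines.

  // Illustrative sketch only; names do not match the generated symbols.
  struct DistanceCaptures { unsigned *i; }; // captures &i (implicit capture)
  struct LoopVarCaptures { unsigned i0; };  // captures i by value at loop entry

  // Distance function: the number of logical iterations, computed before
  // entering the loop (mirrors the arithmetic checked for __captured_stmt).
  static void distance_fn(unsigned *distance, struct DistanceCaptures *ctx) {
    unsigned i = *ctx->i;
    *distance = (i < 32000000u) ? (32000000u - i) / 7u : 0u;
  }

  // Loop variable function: maps a logical iteration number to the value of i
  // (mirrors __captured_stmt.1).
  static void loopvar_fn(unsigned *loopvar, unsigned logical,
                         struct LoopVarCaptures *ctx) {
    *loopvar = ctx->i0 + 7u * logical;
  }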

These allow the OpenMPIRBuilder to act solely on logical iteration numbers, 
without needing to be concerned with the base language's iterator semantics 
between calling the distance function and determining what the value of the 
loop variable ought to be. Any OpenMP logic should be done by the 
OpenMPIRBuilder such that it can be reused by the MLIR OpenMP dialect and thus 
by flang.

The distance and loop variable functions are implemented using lambdas (or, 
more exactly, CapturedStmt, because the lambda implementation is more 
intertwined with the parser). It is up to the OpenMPIRBuilder how they are 
called, which depends on what is done with the loop. By default, these are 
emitted as outlined functions, but we might think about emitting them inline 
as the OpenMPRuntime does.
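
To make this concrete, here is a simplified sequential sketch of how the 
function containing the loop drives the two outlined closures from the sketch 
above. The real code additionally splits the logical iteration space into 
per-thread chunks via __kmpc_for_static_init_4u, as shown in the test's CHECK 
lines; all names here are illustrative.

  struct DistanceCaptures; // as in the sketch above
  struct LoopVarCaptures;
  extern void distance_fn(unsigned *distance, struct DistanceCaptures *ctx);
  extern void loopvar_fn(unsigned *loopvar, unsigned logical,
                         struct LoopVarCaptures *ctx);

  void driver(struct DistanceCaptures *caps, struct LoopVarCaptures *caps0,
              float *a, float *b, float *c, float *d) {
    unsigned count;
    distance_fn(&count, caps);      // called once, before the loop
    for (unsigned iv = 0; iv < count; ++iv) {
      unsigned i;
      loopvar_fn(&i, iv, caps0);    // called at the start of each body
      a[i] = b[i] * c[i] * d[i];    // original loop body
    }
  }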

For compatibility with the current OpenMP implementation, even though it is 
not necessary for the OpenMPIRBuilder, an OMPCanonicalLoop can still be nested 
within an OMPLoopDirective's CapturedStmt. Although OMPCanonicalLoop nodes are 
currently only generated when the OpenMPIRBuilder is enabled, they could 
simply be skipped when not using the OpenMPIRBuilder, in case we don't want to 
make the AST dependent on the EnableOMPBuilder setting.

Loop nests with more than one loop require support by the OpenMPIRBuilder 
(D93268 <https://reviews.llvm.org/D93268>). A simple implementation of 
non-rectangular loop nests would add another lambda function that returns 
whether an iteration of the rectangular overapproximation is also within its 
non-rectangular subset; see the illustration after this paragraph.
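
Purely as an illustration of that idea (nothing like this is in the patch 
yet): a hypothetical predicate for a triangular nest, where the rectangular 
overapproximation iterates the inner loop over the full range.

  //   for (int i = 0; i < N; ++i)
  //     for (int j = 0; j < i; ++j)   // inner bound depends on i
  //       body(i, j);
  // The overapproximation would iterate j over [0, N) for every i, and the
  // additional (hypothetical) closure would filter the iterations:
  static int in_subset(int i, int j) {
    return j < i; // true iff (i, j) belongs to the non-rectangular subset
  }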


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D94973

Files:
  clang/include/clang-c/Index.h
  clang/include/clang/AST/RecursiveASTVisitor.h
  clang/include/clang/AST/StmtOpenMP.h
  clang/include/clang/Basic/StmtNodes.td
  clang/include/clang/Sema/Sema.h
  clang/include/clang/Serialization/ASTBitCodes.h
  clang/lib/AST/Stmt.cpp
  clang/lib/AST/StmtOpenMP.cpp
  clang/lib/AST/StmtPrinter.cpp
  clang/lib/AST/StmtProfile.cpp
  clang/lib/CodeGen/CGStmt.cpp
  clang/lib/CodeGen/CGStmtOpenMP.cpp
  clang/lib/CodeGen/CodeGenFunction.h
  clang/lib/Parse/ParseOpenMP.cpp
  clang/lib/Sema/SemaExpr.cpp
  clang/lib/Sema/SemaOpenMP.cpp
  clang/lib/Sema/TreeTransform.h
  clang/lib/Serialization/ASTReaderStmt.cpp
  clang/lib/Serialization/ASTWriterStmt.cpp
  clang/test/OpenMP/irbuilder_for_iterator.cpp
  clang/test/OpenMP/irbuilder_for_rangefor.cpp
  clang/test/OpenMP/irbuilder_for_unsigned.c
  clang/tools/libclang/CIndex.cpp
  llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
  llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp

Index: llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
===================================================================
--- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -972,7 +972,8 @@
 
   // Emit the body content. We do it after connecting the loop to the CFG to
   // avoid that the callback encounters degenerate BBs.
-  BodyGenCB(CL->getBodyIP(), CL->getIndVar());
+  if (BodyGenCB)
+    BodyGenCB(CL->getBodyIP(), CL->getIndVar());
 
 #ifndef NDEBUG
   CL->assertOK();
@@ -1165,6 +1166,14 @@
   return CLI;
 }
 
+CanonicalLoopInfo *OpenMPIRBuilder::createWorkshareLoop(const LocationDescription &Loc,
+                                                        CanonicalLoopInfo *CLI,
+                                                        InsertPointTy AllocaIP,
+                                                        bool NeedsBarrier) {
+  // Currently only supports static schedules.
+  return createStaticWorkshareLoop(Loc, CLI, AllocaIP, NeedsBarrier);
+}
+
 OpenMPIRBuilder::InsertPointTy
 OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc,
                                    llvm::Value *BufSize, llvm::Value *CpyBuf,
Index: llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
===================================================================
--- llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -300,6 +300,12 @@
                                                bool NeedsBarrier,
                                                Value *Chunk = nullptr);
 
+  /// Modifies the canonical loop to be a workshare loop.
+  CanonicalLoopInfo *createWorkshareLoop(const LocationDescription &Loc,
+                                         CanonicalLoopInfo *CLI,
+                                         InsertPointTy AllocaIP,
+                                         bool NeedsBarrier);
+
   /// Generator for '#omp flush'
   ///
   /// \param Loc The location where the flush directive was encountered
@@ -729,6 +735,12 @@
   BasicBlock *Exit;
   BasicBlock *After;
 
+  /// Add the control blocks of this loop to \p BBs.
+  ///
+  /// This does not include any block from the body, not even the one returned
+  /// by getBody().
+  void collectControlBlocks(SmallVectorImpl<BasicBlock *> &BBs);
+
 public:
   /// The preheader ensures that there is only a single edge entering the loop.
   /// Code that must be execute before any loop iteration can be emitted here,
@@ -781,6 +793,14 @@
     return IndVarPHI;
   }
 
+  /// Return the type of the induction variable (and the trip count).
+  Type *getIndVarType() const { return getIndVar()->getType(); }
+
+  /// Return the insertion point for user code before the loop.
+  OpenMPIRBuilder::InsertPointTy getPreheaderIP() const {
+    return {Preheader, std::prev(Preheader->end())};
+  };
+
   /// Return the insertion point for user code in the body.
   OpenMPIRBuilder::InsertPointTy getBodyIP() const {
     return {Body, Body->begin()};
@@ -791,6 +811,8 @@
     return {After, After->begin()};
   };
 
+  Function *getFunction() const { return Header->getParent(); }
+
   /// Consistency self-check.
   void assertOK() const;
 };
Index: clang/tools/libclang/CIndex.cpp
===================================================================
--- clang/tools/libclang/CIndex.cpp
+++ clang/tools/libclang/CIndex.cpp
@@ -2042,6 +2042,7 @@
   void VisitPseudoObjectExpr(const PseudoObjectExpr *E);
   void VisitOpaqueValueExpr(const OpaqueValueExpr *E);
   void VisitLambdaExpr(const LambdaExpr *E);
+  void VisitOMPCanonicalLoop(const OMPCanonicalLoop *L);
   void VisitOMPExecutableDirective(const OMPExecutableDirective *D);
   void VisitOMPLoopDirective(const OMPLoopDirective *D);
   void VisitOMPParallelDirective(const OMPParallelDirective *D);
@@ -2838,6 +2839,11 @@
   Visit(E->getSyntacticForm());
 }
 
+void EnqueueVisitor::VisitOMPCanonicalLoop(const OMPCanonicalLoop *L) {
+  VisitStmt(L);
+  EnqueueChildren(L);
+}
+
 void EnqueueVisitor::VisitOMPExecutableDirective(
     const OMPExecutableDirective *D) {
   EnqueueChildren(D);
@@ -5656,6 +5662,8 @@
     return cxstring::createRef("attribute(warn_unused_result)");
   case CXCursor_AlignedAttr:
     return cxstring::createRef("attribute(aligned)");
+  case CXCursor_OMPCanonicalLoop:
+    return cxstring::createRef("OMPCanonicalLoop");
   }
 
   llvm_unreachable("Unhandled CXCursorKind");
Index: clang/test/OpenMP/irbuilder_for_unsigned.c
===================================================================
--- /dev/null
+++ clang/test/OpenMP/irbuilder_for_unsigned.c
@@ -0,0 +1,147 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs
+// RUN: %clang_cc1 -fopenmp-enable-irbuilder -verify -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+extern "C" void workshareloop_unsigned(float *a, float *b, float *c, float *d) {
+#pragma omp for
+  for (unsigned i = 33; i < 32000000; i += 7) {
+    a[i] = b[i] * c[i] * d[i];
+  }
+}
+
+#endif // HEADER
+// CHECK-LABEL: define {{[^@]+}}@workshareloop_unsigned
+// CHECK-SAME: (float* [[A:%.*]], float* [[B:%.*]], float* [[C:%.*]], float* [[D:%.*]]) [[ATTR0:#.*]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca float*, align 8
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca float*, align 8
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca float*, align 8
+// CHECK-NEXT:    [[D_ADDR:%.*]] = alloca float*, align 8
+// CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 8
+// CHECK-NEXT:    [[DOTCOUNT_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[AGG_CAPTURED1:%.*]] = alloca [[STRUCT_ANON_0:%.*]], align 4
+// CHECK-NEXT:    [[P_LASTITER:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[P_LOWERBOUND:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[P_UPPERBOUND:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[P_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
+// CHECK-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
+// CHECK-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
+// CHECK-NEXT:    store float* [[D]], float** [[D_ADDR]], align 8
+// CHECK-NEXT:    store i32 33, i32* [[I]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[AGG_CAPTURED]], i32 0, i32 0
+// CHECK-NEXT:    store i32* [[I]], i32** [[TMP0]], align 8
+// CHECK-NEXT:    call void @__captured_stmt(i32* [[DOTCOUNT_ADDR]], %struct.anon* [[AGG_CAPTURED]])
+// CHECK-NEXT:    [[DOTCOUNT:%.*]] = load i32, i32* [[DOTCOUNT_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_ANON_0]], %struct.anon.0* [[AGG_CAPTURED1]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[I]], align 4
+// CHECK-NEXT:    store i32 [[TMP2]], i32* [[TMP1]], align 4
+// CHECK-NEXT:    br label [[OMP_LOOP_PREHEADER:%.*]]
+// CHECK:       omp_loop.preheader:
+// CHECK-NEXT:    store i32 0, i32* [[P_LOWERBOUND]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = sub i32 [[DOTCOUNT]], 1
+// CHECK-NEXT:    store i32 [[TMP3]], i32* [[P_UPPERBOUND]], align 4
+// CHECK-NEXT:    store i32 1, i32* [[P_STRIDE]], align 4
+// CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1:@.*]])
+// CHECK-NEXT:    call void @__kmpc_for_static_init_4u(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 34, i32* [[P_LASTITER]], i32* [[P_LOWERBOUND]], i32* [[P_UPPERBOUND]], i32* [[P_STRIDE]], i32 1, i32 1)
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[P_LOWERBOUND]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[P_UPPERBOUND]], align 4
+// CHECK-NEXT:    [[TMP6:%.*]] = sub i32 [[TMP5]], [[TMP4]]
+// CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP6]], 1
+// CHECK-NEXT:    br label [[OMP_LOOP_HEADER:%.*]]
+// CHECK:       omp_loop.header:
+// CHECK-NEXT:    [[OMP_LOOP_IV:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER]] ], [ [[OMP_LOOP_NEXT:%.*]], [[OMP_LOOP_INC:%.*]] ]
+// CHECK-NEXT:    br label [[OMP_LOOP_COND:%.*]]
+// CHECK:       omp_loop.cond:
+// CHECK-NEXT:    [[OMP_LOOP_CMP:%.*]] = icmp ult i32 [[OMP_LOOP_IV]], [[TMP7]]
+// CHECK-NEXT:    br i1 [[OMP_LOOP_CMP]], label [[OMP_LOOP_BODY:%.*]], label [[OMP_LOOP_EXIT:%.*]]
+// CHECK:       omp_loop.body:
+// CHECK-NEXT:    [[TMP8:%.*]] = add i32 [[OMP_LOOP_IV]], [[TMP4]]
+// CHECK-NEXT:    call void @__captured_stmt.1(i32* [[I]], i32 [[TMP8]], %struct.anon.0* [[AGG_CAPTURED1]])
+// CHECK-NEXT:    [[TMP9:%.*]] = load float*, float** [[B_ADDR]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = load i32, i32* [[I]], align 4
+// CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[TMP10]] to i64
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[TMP9]], i64 [[IDXPROM]]
+// CHECK-NEXT:    [[TMP11:%.*]] = load float, float* [[ARRAYIDX]], align 4
+// CHECK-NEXT:    [[TMP12:%.*]] = load float*, float** [[C_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[I]], align 4
+// CHECK-NEXT:    [[IDXPROM2:%.*]] = zext i32 [[TMP13]] to i64
+// CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[TMP12]], i64 [[IDXPROM2]]
+// CHECK-NEXT:    [[TMP14:%.*]] = load float, float* [[ARRAYIDX3]], align 4
+// CHECK-NEXT:    [[MUL:%.*]] = fmul float [[TMP11]], [[TMP14]]
+// CHECK-NEXT:    [[TMP15:%.*]] = load float*, float** [[D_ADDR]], align 8
+// CHECK-NEXT:    [[TMP16:%.*]] = load i32, i32* [[I]], align 4
+// CHECK-NEXT:    [[IDXPROM4:%.*]] = zext i32 [[TMP16]] to i64
+// CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[TMP15]], i64 [[IDXPROM4]]
+// CHECK-NEXT:    [[TMP17:%.*]] = load float, float* [[ARRAYIDX5]], align 4
+// CHECK-NEXT:    [[MUL6:%.*]] = fmul float [[MUL]], [[TMP17]]
+// CHECK-NEXT:    [[TMP18:%.*]] = load float*, float** [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP19:%.*]] = load i32, i32* [[I]], align 4
+// CHECK-NEXT:    [[IDXPROM7:%.*]] = zext i32 [[TMP19]] to i64
+// CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[TMP18]], i64 [[IDXPROM7]]
+// CHECK-NEXT:    store float [[MUL6]], float* [[ARRAYIDX8]], align 4
+// CHECK-NEXT:    br label [[OMP_LOOP_INC]]
+// CHECK:       omp_loop.inc:
+// CHECK-NEXT:    [[OMP_LOOP_NEXT]] = add nuw i32 [[OMP_LOOP_IV]], 1
+// CHECK-NEXT:    br label [[OMP_LOOP_HEADER]]
+// CHECK:       omp_loop.exit:
+// CHECK-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+// CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM9:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+// CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2:@.*]], i32 [[OMP_GLOBAL_THREAD_NUM9]])
+// CHECK-NEXT:    br label [[OMP_LOOP_AFTER:%.*]]
+// CHECK:       omp_loop.after:
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__captured_stmt
+// CHECK-SAME: (i32* nonnull align 4 dereferenceable(4) [[DISTANCE:%.*]], %struct.anon* noalias [[__CONTEXT:%.*]]) [[ATTR1:#.*]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DISTANCE_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT:    [[__CONTEXT_ADDR:%.*]] = alloca %struct.anon*, align 8
+// CHECK-NEXT:    store i32* [[DISTANCE]], i32** [[DISTANCE_ADDR]], align 8
+// CHECK-NEXT:    store %struct.anon* [[__CONTEXT]], %struct.anon** [[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load %struct.anon*, %struct.anon** [[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], %struct.anon* [[TMP0]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[TMP1]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[TMP3]], 32000000
+// CHECK-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK:       cond.true:
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP0]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK-NEXT:    [[SUB:%.*]] = sub i32 32000000, [[TMP6]]
+// CHECK-NEXT:    [[DIV:%.*]] = udiv i32 [[SUB]], 7
+// CHECK-NEXT:    br label [[COND_END:%.*]]
+// CHECK:       cond.false:
+// CHECK-NEXT:    br label [[COND_END]]
+// CHECK:       cond.end:
+// CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ]
+// CHECK-NEXT:    [[TMP7:%.*]] = load i32*, i32** [[DISTANCE_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[COND]], i32* [[TMP7]], align 4
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__captured_stmt.1
+// CHECK-SAME: (i32* nonnull align 4 dereferenceable(4) [[LOOPVAR:%.*]], i32 [[LOGICAL:%.*]], %struct.anon.0* noalias [[__CONTEXT:%.*]]) [[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[LOOPVAR_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT:    [[LOGICAL_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__CONTEXT_ADDR:%.*]] = alloca %struct.anon.0*, align 8
+// CHECK-NEXT:    store i32* [[LOOPVAR]], i32** [[LOOPVAR_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[LOGICAL]], i32* [[LOGICAL_ADDR]], align 4
+// CHECK-NEXT:    store %struct.anon.0* [[__CONTEXT]], %struct.anon.0** [[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load %struct.anon.0*, %struct.anon.0** [[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_ANON_0:%.*]], %struct.anon.0* [[TMP0]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[LOGICAL_ADDR]], align 4
+// CHECK-NEXT:    [[MUL:%.*]] = mul i32 7, [[TMP3]]
+// CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]]
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[LOOPVAR_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[ADD]], i32* [[TMP4]], align 4
+// CHECK-NEXT:    ret void
+//
Index: clang/test/OpenMP/irbuilder_for_rangefor.cpp
===================================================================
--- /dev/null
+++ clang/test/OpenMP/irbuilder_for_rangefor.cpp
@@ -0,0 +1,164 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs
+// RUN: %clang_cc1 -fopenmp-enable-irbuilder -verify -fopenmp -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+struct MyIterator {
+  MyIterator(unsigned pos);
+  MyIterator(const MyIterator &other);
+  const MyIterator &operator=(const MyIterator &that);
+  MyIterator &operator++();
+  int operator-(const MyIterator &that) const;
+  MyIterator &operator+=(unsigned a);
+  MyIterator operator+(unsigned a) const;
+  bool operator==(const MyIterator &that) const;
+  bool operator!=(const MyIterator &that) const;
+  unsigned operator*() const;
+};
+
+struct MyRange {
+  MyRange(int n);
+
+  MyIterator begin();
+  MyIterator end();
+};
+
+extern "C" void workshareloop_rangefor(float *a, float *b, float *c) {
+#pragma omp for
+  for (unsigned i : MyRange(42)) {
+    a[i] = b[i] * c[i];
+  }
+}
+
+#endif // HEADER
+// CHECK-LABEL: define {{[^@]+}}@workshareloop_rangefor
+// CHECK-SAME: (float* [[A:%.*]], float* [[B:%.*]], float* [[C:%.*]]) [[ATTR0:#.*]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca float*, align 8
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca float*, align 8
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca float*, align 8
+// CHECK-NEXT:    [[__RANGE2:%.*]] = alloca %struct.MyRange*, align 8
+// CHECK-NEXT:    [[REF_TMP:%.*]] = alloca [[STRUCT_MYRANGE:%.*]], align 1
+// CHECK-NEXT:    [[__BEGIN2:%.*]] = alloca [[STRUCT_MYITERATOR:%.*]], align 1
+// CHECK-NEXT:    [[__END2:%.*]] = alloca [[STRUCT_MYITERATOR]], align 1
+// CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 8
+// CHECK-NEXT:    [[DOTCOUNT_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT:    [[AGG_CAPTURED1:%.*]] = alloca [[STRUCT_ANON_0:%.*]], align 1
+// CHECK-NEXT:    [[P_LASTITER:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[P_LOWERBOUND:%.*]] = alloca i64, align 8
+// CHECK-NEXT:    [[P_UPPERBOUND:%.*]] = alloca i64, align 8
+// CHECK-NEXT:    [[P_STRIDE:%.*]] = alloca i64, align 8
+// CHECK-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
+// CHECK-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
+// CHECK-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
+// CHECK-NEXT:    call void @_ZN7MyRangeC1Ei(%struct.MyRange* nonnull dereferenceable(1) [[REF_TMP]], i32 42)
+// CHECK-NEXT:    store %struct.MyRange* [[REF_TMP]], %struct.MyRange** [[__RANGE2]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load %struct.MyRange*, %struct.MyRange** [[__RANGE2]], align 8
+// CHECK-NEXT:    call void @_ZN7MyRange5beginEv(%struct.MyIterator* sret(%struct.MyIterator) align 1 [[__BEGIN2]], %struct.MyRange* nonnull dereferenceable(1) [[TMP0]])
+// CHECK-NEXT:    [[TMP1:%.*]] = load %struct.MyRange*, %struct.MyRange** [[__RANGE2]], align 8
+// CHECK-NEXT:    call void @_ZN7MyRange3endEv(%struct.MyIterator* sret(%struct.MyIterator) align 1 [[__END2]], %struct.MyRange* nonnull dereferenceable(1) [[TMP1]])
+// CHECK-NEXT:    [[CALL:%.*]] = call i32 @_ZNK10MyIteratordeEv(%struct.MyIterator* nonnull dereferenceable(1) [[__BEGIN2]])
+// CHECK-NEXT:    store i32 [[CALL]], i32* [[I]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[AGG_CAPTURED]], i32 0, i32 0
+// CHECK-NEXT:    store %struct.MyIterator* [[__BEGIN2]], %struct.MyIterator** [[TMP2]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[AGG_CAPTURED]], i32 0, i32 1
+// CHECK-NEXT:    store %struct.MyIterator* [[__END2]], %struct.MyIterator** [[TMP3]], align 8
+// CHECK-NEXT:    call void @__captured_stmt(i64* [[DOTCOUNT_ADDR]], %struct.anon* [[AGG_CAPTURED]])
+// CHECK-NEXT:    [[DOTCOUNT:%.*]] = load i64, i64* [[DOTCOUNT_ADDR]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ANON_0]], %struct.anon.0* [[AGG_CAPTURED1]], i32 0, i32 0
+// CHECK-NEXT:    call void @_ZN10MyIteratorC1ERKS_(%struct.MyIterator* nonnull dereferenceable(1) [[TMP4]], %struct.MyIterator* nonnull align 1 dereferenceable(1) [[__BEGIN2]])
+// CHECK-NEXT:    br label [[OMP_LOOP_PREHEADER:%.*]]
+// CHECK:       omp_loop.preheader:
+// CHECK-NEXT:    store i64 0, i64* [[P_LOWERBOUND]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = sub i64 [[DOTCOUNT]], 1
+// CHECK-NEXT:    store i64 [[TMP5]], i64* [[P_UPPERBOUND]], align 8
+// CHECK-NEXT:    store i64 1, i64* [[P_STRIDE]], align 8
+// CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1:@.*]])
+// CHECK-NEXT:    call void @__kmpc_for_static_init_8u(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 34, i32* [[P_LASTITER]], i64* [[P_LOWERBOUND]], i64* [[P_UPPERBOUND]], i64* [[P_STRIDE]], i64 1, i64 1)
+// CHECK-NEXT:    [[TMP6:%.*]] = load i64, i64* [[P_LOWERBOUND]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = load i64, i64* [[P_UPPERBOUND]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = sub i64 [[TMP7]], [[TMP6]]
+// CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[TMP8]], 1
+// CHECK-NEXT:    br label [[OMP_LOOP_HEADER:%.*]]
+// CHECK:       omp_loop.header:
+// CHECK-NEXT:    [[OMP_LOOP_IV:%.*]] = phi i64 [ 0, [[OMP_LOOP_PREHEADER]] ], [ [[OMP_LOOP_NEXT:%.*]], [[OMP_LOOP_INC:%.*]] ]
+// CHECK-NEXT:    br label [[OMP_LOOP_COND:%.*]]
+// CHECK:       omp_loop.cond:
+// CHECK-NEXT:    [[OMP_LOOP_CMP:%.*]] = icmp ult i64 [[OMP_LOOP_IV]], [[TMP9]]
+// CHECK-NEXT:    br i1 [[OMP_LOOP_CMP]], label [[OMP_LOOP_BODY:%.*]], label [[OMP_LOOP_EXIT:%.*]]
+// CHECK:       omp_loop.body:
+// CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[OMP_LOOP_IV]], [[TMP6]]
+// CHECK-NEXT:    call void @__captured_stmt.1(i32* [[I]], i64 [[TMP10]], %struct.anon.0* [[AGG_CAPTURED1]])
+// CHECK-NEXT:    [[TMP11:%.*]] = load float*, float** [[B_ADDR]], align 8
+// CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[I]], align 4
+// CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[TMP12]] to i64
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[TMP11]], i64 [[IDXPROM]]
+// CHECK-NEXT:    [[TMP13:%.*]] = load float, float* [[ARRAYIDX]], align 4
+// CHECK-NEXT:    [[TMP14:%.*]] = load float*, float** [[C_ADDR]], align 8
+// CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* [[I]], align 4
+// CHECK-NEXT:    [[IDXPROM2:%.*]] = zext i32 [[TMP15]] to i64
+// CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 [[IDXPROM2]]
+// CHECK-NEXT:    [[TMP16:%.*]] = load float, float* [[ARRAYIDX3]], align 4
+// CHECK-NEXT:    [[MUL:%.*]] = fmul float [[TMP13]], [[TMP16]]
+// CHECK-NEXT:    [[TMP17:%.*]] = load float*, float** [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = load i32, i32* [[I]], align 4
+// CHECK-NEXT:    [[IDXPROM4:%.*]] = zext i32 [[TMP18]] to i64
+// CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[TMP17]], i64 [[IDXPROM4]]
+// CHECK-NEXT:    store float [[MUL]], float* [[ARRAYIDX5]], align 4
+// CHECK-NEXT:    br label [[OMP_LOOP_INC]]
+// CHECK:       omp_loop.inc:
+// CHECK-NEXT:    [[OMP_LOOP_NEXT]] = add nuw i64 [[OMP_LOOP_IV]], 1
+// CHECK-NEXT:    br label [[OMP_LOOP_HEADER]]
+// CHECK:       omp_loop.exit:
+// CHECK-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+// CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+// CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2:@.*]], i32 [[OMP_GLOBAL_THREAD_NUM6]])
+// CHECK-NEXT:    br label [[OMP_LOOP_AFTER:%.*]]
+// CHECK:       omp_loop.after:
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__captured_stmt
+// CHECK-SAME: (i64* nonnull align 8 dereferenceable(8) [[DISTANCE:%.*]], %struct.anon* noalias [[__CONTEXT:%.*]]) [[ATTR2:#.*]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DISTANCE_ADDR:%.*]] = alloca i64*, align 8
+// CHECK-NEXT:    [[__CONTEXT_ADDR:%.*]] = alloca %struct.anon*, align 8
+// CHECK-NEXT:    store i64* [[DISTANCE]], i64** [[DISTANCE_ADDR]], align 8
+// CHECK-NEXT:    store %struct.anon* [[__CONTEXT]], %struct.anon** [[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load %struct.anon*, %struct.anon** [[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], %struct.anon* [[TMP0]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load %struct.MyIterator*, %struct.MyIterator** [[TMP1]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP0]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load %struct.MyIterator*, %struct.MyIterator** [[TMP3]], align 8
+// CHECK-NEXT:    [[CALL:%.*]] = call i32 @_ZNK10MyIteratormiERKS_(%struct.MyIterator* nonnull dereferenceable(1) [[TMP2]], %struct.MyIterator* nonnull align 1 dereferenceable(1) [[TMP4]])
+// CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[CALL]] to i64
+// CHECK-NEXT:    [[DIV:%.*]] = udiv i64 [[CONV]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load i64*, i64** [[DISTANCE_ADDR]], align 8
+// CHECK-NEXT:    store i64 [[DIV]], i64* [[TMP5]], align 8
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__captured_stmt.1
+// CHECK-SAME: (i32* nonnull align 4 dereferenceable(4) [[LOOPVAR:%.*]], i64 [[LOGICAL:%.*]], %struct.anon.0* noalias [[__CONTEXT:%.*]]) [[ATTR2]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[LOOPVAR_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT:    [[LOGICAL_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT:    [[__CONTEXT_ADDR:%.*]] = alloca %struct.anon.0*, align 8
+// CHECK-NEXT:    [[REF_TMP:%.*]] = alloca [[STRUCT_MYITERATOR:%.*]], align 1
+// CHECK-NEXT:    store i32* [[LOOPVAR]], i32** [[LOOPVAR_ADDR]], align 8
+// CHECK-NEXT:    store i64 [[LOGICAL]], i64* [[LOGICAL_ADDR]], align 8
+// CHECK-NEXT:    store %struct.anon.0* [[__CONTEXT]], %struct.anon.0** [[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load %struct.anon.0*, %struct.anon.0** [[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_ANON_0:%.*]], %struct.anon.0* [[TMP0]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* [[LOGICAL_ADDR]], align 8
+// CHECK-NEXT:    [[MUL:%.*]] = mul i64 1, [[TMP2]]
+// CHECK-NEXT:    [[CONV:%.*]] = trunc i64 [[MUL]] to i32
+// CHECK-NEXT:    call void @_ZNK10MyIteratorplEj(%struct.MyIterator* sret(%struct.MyIterator) align 1 [[REF_TMP]], %struct.MyIterator* nonnull dereferenceable(1) [[TMP1]], i32 [[CONV]])
+// CHECK-NEXT:    [[CALL:%.*]] = call i32 @_ZNK10MyIteratordeEv(%struct.MyIterator* nonnull dereferenceable(1) [[REF_TMP]])
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32*, i32** [[LOOPVAR_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[CALL]], i32* [[TMP3]], align 4
+// CHECK-NEXT:    ret void
+//
Index: clang/test/OpenMP/irbuilder_for_iterator.cpp
===================================================================
--- /dev/null
+++ clang/test/OpenMP/irbuilder_for_iterator.cpp
@@ -0,0 +1,147 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs
+// RUN: %clang_cc1 -fopenmp-enable-irbuilder -verify -fopenmp -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+struct MyIterator {
+  MyIterator(unsigned pos);
+  MyIterator(const MyIterator &other);
+  const MyIterator &operator=(const MyIterator &that);
+  MyIterator &operator++();
+  int operator-(const MyIterator &that) const;
+  MyIterator &operator+=(unsigned a);
+  MyIterator operator+(unsigned a) const;
+  bool operator==(const MyIterator &that) const;
+  bool operator!=(const MyIterator &that) const;
+  unsigned operator*() const;
+};
+
+extern "C" void workshareloop_iterator(float *a, float *b, float *c) {
+#pragma omp for
+  for (MyIterator it = MyIterator(7); it != MyIterator(41); ++it) {
+    unsigned i = *it;
+    a[i] = b[i] * c[i];
+  }
+}
+
+#endif // HEADER
+// CHECK-LABEL: define {{[^@]+}}@workshareloop_iterator
+// CHECK-SAME: (float* [[A:%.*]], float* [[B:%.*]], float* [[C:%.*]]) [[ATTR0:#.*]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca float*, align 8
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca float*, align 8
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca float*, align 8
+// CHECK-NEXT:    [[IT:%.*]] = alloca [[STRUCT_MYITERATOR:%.*]], align 1
+// CHECK-NEXT:    [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 8
+// CHECK-NEXT:    [[DOTCOUNT_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT:    [[AGG_CAPTURED1:%.*]] = alloca [[STRUCT_ANON_0:%.*]], align 1
+// CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[P_LASTITER:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[P_LOWERBOUND:%.*]] = alloca i64, align 8
+// CHECK-NEXT:    [[P_UPPERBOUND:%.*]] = alloca i64, align 8
+// CHECK-NEXT:    [[P_STRIDE:%.*]] = alloca i64, align 8
+// CHECK-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
+// CHECK-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
+// CHECK-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
+// CHECK-NEXT:    call void @_ZN10MyIteratorC1Ej(%struct.MyIterator* nonnull dereferenceable(1) [[IT]], i32 7)
+// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[AGG_CAPTURED]], i32 0, i32 0
+// CHECK-NEXT:    store %struct.MyIterator* [[IT]], %struct.MyIterator** [[TMP0]], align 8
+// CHECK-NEXT:    call void @__captured_stmt(i64* [[DOTCOUNT_ADDR]], %struct.anon* [[AGG_CAPTURED]])
+// CHECK-NEXT:    [[DOTCOUNT:%.*]] = load i64, i64* [[DOTCOUNT_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_ANON_0]], %struct.anon.0* [[AGG_CAPTURED1]], i32 0, i32 0
+// CHECK-NEXT:    call void @_ZN10MyIteratorC1ERKS_(%struct.MyIterator* nonnull dereferenceable(1) [[TMP1]], %struct.MyIterator* nonnull align 1 dereferenceable(1) [[IT]])
+// CHECK-NEXT:    br label [[OMP_LOOP_PREHEADER:%.*]]
+// CHECK:       omp_loop.preheader:
+// CHECK-NEXT:    store i64 0, i64* [[P_LOWERBOUND]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[DOTCOUNT]], 1
+// CHECK-NEXT:    store i64 [[TMP2]], i64* [[P_UPPERBOUND]], align 8
+// CHECK-NEXT:    store i64 1, i64* [[P_STRIDE]], align 8
+// CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1:@.*]])
+// CHECK-NEXT:    call void @__kmpc_for_static_init_8u(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 34, i32* [[P_LASTITER]], i64* [[P_LOWERBOUND]], i64* [[P_UPPERBOUND]], i64* [[P_STRIDE]], i64 1, i64 1)
+// CHECK-NEXT:    [[TMP3:%.*]] = load i64, i64* [[P_LOWERBOUND]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = load i64, i64* [[P_UPPERBOUND]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = sub i64 [[TMP4]], [[TMP3]]
+// CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[TMP5]], 1
+// CHECK-NEXT:    br label [[OMP_LOOP_HEADER:%.*]]
+// CHECK:       omp_loop.header:
+// CHECK-NEXT:    [[OMP_LOOP_IV:%.*]] = phi i64 [ 0, [[OMP_LOOP_PREHEADER]] ], [ [[OMP_LOOP_NEXT:%.*]], [[OMP_LOOP_INC:%.*]] ]
+// CHECK-NEXT:    br label [[OMP_LOOP_COND:%.*]]
+// CHECK:       omp_loop.cond:
+// CHECK-NEXT:    [[OMP_LOOP_CMP:%.*]] = icmp ult i64 [[OMP_LOOP_IV]], [[TMP6]]
+// CHECK-NEXT:    br i1 [[OMP_LOOP_CMP]], label [[OMP_LOOP_BODY:%.*]], label [[OMP_LOOP_EXIT:%.*]]
+// CHECK:       omp_loop.body:
+// CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[OMP_LOOP_IV]], [[TMP3]]
+// CHECK-NEXT:    call void @__captured_stmt.1(%struct.MyIterator* [[IT]], i64 [[TMP7]], %struct.anon.0* [[AGG_CAPTURED1]])
+// CHECK-NEXT:    [[CALL:%.*]] = call i32 @_ZNK10MyIteratordeEv(%struct.MyIterator* nonnull dereferenceable(1) [[IT]])
+// CHECK-NEXT:    store i32 [[CALL]], i32* [[I]], align 4
+// CHECK-NEXT:    [[TMP8:%.*]] = load float*, float** [[B_ADDR]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[I]], align 4
+// CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[TMP8]], i64 [[IDXPROM]]
+// CHECK-NEXT:    [[TMP10:%.*]] = load float, float* [[ARRAYIDX]], align 4
+// CHECK-NEXT:    [[TMP11:%.*]] = load float*, float** [[C_ADDR]], align 8
+// CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[I]], align 4
+// CHECK-NEXT:    [[IDXPROM2:%.*]] = zext i32 [[TMP12]] to i64
+// CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[TMP11]], i64 [[IDXPROM2]]
+// CHECK-NEXT:    [[TMP13:%.*]] = load float, float* [[ARRAYIDX3]], align 4
+// CHECK-NEXT:    [[MUL:%.*]] = fmul float [[TMP10]], [[TMP13]]
+// CHECK-NEXT:    [[TMP14:%.*]] = load float*, float** [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* [[I]], align 4
+// CHECK-NEXT:    [[IDXPROM4:%.*]] = zext i32 [[TMP15]] to i64
+// CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 [[IDXPROM4]]
+// CHECK-NEXT:    store float [[MUL]], float* [[ARRAYIDX5]], align 4
+// CHECK-NEXT:    br label [[OMP_LOOP_INC]]
+// CHECK:       omp_loop.inc:
+// CHECK-NEXT:    [[OMP_LOOP_NEXT]] = add nuw i64 [[OMP_LOOP_IV]], 1
+// CHECK-NEXT:    br label [[OMP_LOOP_HEADER]]
+// CHECK:       omp_loop.exit:
+// CHECK-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+// CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+// CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2:@.*]], i32 [[OMP_GLOBAL_THREAD_NUM6]])
+// CHECK-NEXT:    br label [[OMP_LOOP_AFTER:%.*]]
+// CHECK:       omp_loop.after:
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__captured_stmt
+// CHECK-SAME: (i64* nonnull align 8 dereferenceable(8) [[DISTANCE:%.*]], %struct.anon* noalias [[__CONTEXT:%.*]]) [[ATTR2:#.*]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DISTANCE_ADDR:%.*]] = alloca i64*, align 8
+// CHECK-NEXT:    [[__CONTEXT_ADDR:%.*]] = alloca %struct.anon*, align 8
+// CHECK-NEXT:    [[REF_TMP:%.*]] = alloca [[STRUCT_MYITERATOR:%.*]], align 1
+// CHECK-NEXT:    store i64* [[DISTANCE]], i64** [[DISTANCE_ADDR]], align 8
+// CHECK-NEXT:    store %struct.anon* [[__CONTEXT]], %struct.anon** [[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load %struct.anon*, %struct.anon** [[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT:    call void @_ZN10MyIteratorC1Ej(%struct.MyIterator* nonnull dereferenceable(1) [[REF_TMP]], i32 41)
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], %struct.anon* [[TMP0]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load %struct.MyIterator*, %struct.MyIterator** [[TMP1]], align 8
+// CHECK-NEXT:    [[CALL:%.*]] = call i32 @_ZNK10MyIteratormiERKS_(%struct.MyIterator* nonnull dereferenceable(1) [[REF_TMP]], %struct.MyIterator* nonnull align 1 dereferenceable(1) [[TMP2]])
+// CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[CALL]] to i64
+// CHECK-NEXT:    [[DIV:%.*]] = udiv i64 [[CONV]], 1
+// CHECK-NEXT:    [[TMP3:%.*]] = load i64*, i64** [[DISTANCE_ADDR]], align 8
+// CHECK-NEXT:    store i64 [[DIV]], i64* [[TMP3]], align 8
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__captured_stmt.1
+// CHECK-SAME: (%struct.MyIterator* nonnull align 1 dereferenceable(1) [[LOOPVAR:%.*]], i64 [[LOGICAL:%.*]], %struct.anon.0* noalias [[__CONTEXT:%.*]]) [[ATTR2]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[LOOPVAR_ADDR:%.*]] = alloca %struct.MyIterator*, align 8
+// CHECK-NEXT:    [[LOGICAL_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT:    [[__CONTEXT_ADDR:%.*]] = alloca %struct.anon.0*, align 8
+// CHECK-NEXT:    [[REF_TMP:%.*]] = alloca [[STRUCT_MYITERATOR:%.*]], align 1
+// CHECK-NEXT:    store %struct.MyIterator* [[LOOPVAR]], %struct.MyIterator** [[LOOPVAR_ADDR]], align 8
+// CHECK-NEXT:    store i64 [[LOGICAL]], i64* [[LOGICAL_ADDR]], align 8
+// CHECK-NEXT:    store %struct.anon.0* [[__CONTEXT]], %struct.anon.0** [[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load %struct.anon.0*, %struct.anon.0** [[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_ANON_0:%.*]], %struct.anon.0* [[TMP0]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* [[LOGICAL_ADDR]], align 8
+// CHECK-NEXT:    [[MUL:%.*]] = mul i64 1, [[TMP2]]
+// CHECK-NEXT:    [[CONV:%.*]] = trunc i64 [[MUL]] to i32
+// CHECK-NEXT:    call void @_ZNK10MyIteratorplEj(%struct.MyIterator* sret(%struct.MyIterator) align 1 [[REF_TMP]], %struct.MyIterator* nonnull dereferenceable(1) [[TMP1]], i32 [[CONV]])
+// CHECK-NEXT:    [[TMP3:%.*]] = load %struct.MyIterator*, %struct.MyIterator** [[LOOPVAR_ADDR]], align 8
+// CHECK-NEXT:    [[CALL:%.*]] = call nonnull align 1 dereferenceable(1) %struct.MyIterator* @_ZN10MyIteratoraSERKS_(%struct.MyIterator* nonnull dereferenceable(1) [[TMP3]], %struct.MyIterator* nonnull align 1 dereferenceable(1) [[REF_TMP]])
+// CHECK-NEXT:    ret void
+//
Index: clang/lib/Serialization/ASTWriterStmt.cpp
===================================================================
--- clang/lib/Serialization/ASTWriterStmt.cpp
+++ clang/lib/Serialization/ASTWriterStmt.cpp
@@ -2170,6 +2170,13 @@
 // OpenMP Directives.
 //===----------------------------------------------------------------------===//
 
+void ASTStmtWriter::VisitOMPCanonicalLoop(OMPCanonicalLoop *S) {
+  VisitStmt(S);
+  for (Stmt *SubStmt : S->SubStmts)
+    Record.AddStmt(SubStmt);
+  Code = serialization::STMT_OMP_CANONICAL_LOOP;
+}
+
 void ASTStmtWriter::VisitOMPExecutableDirective(OMPExecutableDirective *E) {
   Record.writeOMPChildren(E->Data);
   Record.AddSourceLocation(E->getBeginLoc());
Index: clang/lib/Serialization/ASTReaderStmt.cpp
===================================================================
--- clang/lib/Serialization/ASTReaderStmt.cpp
+++ clang/lib/Serialization/ASTReaderStmt.cpp
@@ -2273,6 +2273,12 @@
 // OpenMP Directives.
 //===----------------------------------------------------------------------===//
 
+void ASTStmtReader::VisitOMPCanonicalLoop(OMPCanonicalLoop *S) {
+  VisitStmt(S);
+  for (Stmt *&SubStmt : S->SubStmts)
+    SubStmt = Record.readSubStmt();
+}
+
 void ASTStmtReader::VisitOMPExecutableDirective(OMPExecutableDirective *E) {
   Record.readOMPChildren(E->Data);
   E->setLocStart(readSourceLocation());
Index: clang/lib/Sema/TreeTransform.h
===================================================================
--- clang/lib/Sema/TreeTransform.h
+++ clang/lib/Sema/TreeTransform.h
@@ -8322,6 +8322,15 @@
 //===----------------------------------------------------------------------===//
 // OpenMP directive transformation
 //===----------------------------------------------------------------------===//
+
+template <typename Derived>
+StmtResult
+TreeTransform<Derived>::TransformOMPCanonicalLoop(OMPCanonicalLoop *L) {
+  // The OMPCanonicalLoop will be recreated when transforming the loop-associated
+  // directive.
+  return getDerived().TransformStmt(L->getLoopStmt());
+}
+
 template <typename Derived>
 StmtResult TreeTransform<Derived>::TransformOMPExecutableDirective(
     OMPExecutableDirective *D) {
@@ -8358,6 +8367,9 @@
       else
         CS = D->getInnermostCapturedStmt()->getCapturedStmt();
       Body = getDerived().TransformStmt(CS);
+      if (Body.isUsable() && isOpenMPLoopDirective(D->getDirectiveKind()) &&
+          getSema().getLangOpts().OpenMPIRBuilder)
+        Body = getSema().ActOnOpenMPCanonicalLoop(Body.get());
     }
     AssociatedStmt =
         getDerived().getSema().ActOnOpenMPRegionEnd(Body, TClauses);
Index: clang/lib/Sema/SemaOpenMP.cpp
===================================================================
--- clang/lib/Sema/SemaOpenMP.cpp
+++ clang/lib/Sema/SemaOpenMP.cpp
@@ -3767,6 +3767,9 @@
       Visit(E->getBase());
     }
   }
+
+  void VisitOMPCanonicalLoop(OMPCanonicalLoop *S) { VisitStmt(S); }
+
   void VisitOMPExecutableDirective(OMPExecutableDirective *S) {
     for (OMPClause *C : S->clauses()) {
       // Skip analysis of arguments of implicitly defined firstprivate clause
@@ -5130,6 +5133,350 @@
   }
 }
 
+namespace {
+/// Rewrite statements and expressions for Sema \p Actions' CurContext.
+/// Used to capture variable references of already-parsed statements and
+/// expressions into a CapturedStmt.
+class CaptureVars : public TreeTransform<CaptureVars> {
+  using BaseTransform = TreeTransform<CaptureVars>;
+
+public:
+  CaptureVars(Sema &Actions) : BaseTransform(Actions) {}
+
+  bool AlwaysRebuild() { return true; }
+};
+} // namespace
+
+/// Create a closure that computes the number of iterations of a loop.
+///
+/// \param Actions   The Sema object.
+/// \param LogicalTy Type for the logical iteration number.
+/// \param Rel       Comparison operator of the loop condition.
+/// \param StartExpr Value of the loop counter at the first iteration.
+/// \param StopExpr  Expression the loop counter is compared against in the loop condition.
+/// \param Step      Amount of increment after each iteration.
+///
+/// \return Closure (CapturedStmt) of the distance calculation.
+static CapturedStmt *buildDistanceFunc(Sema &Actions, QualType LogicalTy,
+                                       BinaryOperator::Opcode Rel,
+                                       Expr *StartExpr, Expr *StopExpr,
+                                       Expr *Step) {
+  ASTContext &Ctx = Actions.getASTContext();
+  TypeSourceInfo *LogicalTSI = Ctx.getTrivialTypeSourceInfo(LogicalTy);
+
+  // Captured regions currently don't support return values; we use an
+  // out-parameter instead. All inputs are implicit captures.
+  // TODO: Instead of capturing each DeclRefExpr occurring in
+  // StartExpr/StopExpr/Step, these could also be passed as a value capture.
+  QualType ResultTy = Ctx.getLValueReferenceType(LogicalTy);
+  Sema::CapturedParamNameType Params[] = {{"Distance", ResultTy},
+                                          {StringRef(), QualType()}};
+  Actions.ActOnCapturedRegionStart({}, nullptr, CR_Default, Params);
+
+  Expr *Body;
+  {
+    Sema::CompoundScopeRAII CompoundScope(Actions);
+    CapturedDecl *CS = cast<CapturedDecl>(Actions.CurContext);
+
+    // Get the LValue expression for the result.
+    ImplicitParamDecl *DistParam = CS->getParam(0);
+    DeclRefExpr *DistRef = Actions.BuildDeclRefExpr(
+        DistParam, LogicalTy, VK_LValue, {}, nullptr, nullptr, {}, nullptr);
+
+    // Capture all referenced variable references.
+    CaptureVars Recap(Actions);
+    Expr *NewStart = AssertSuccess(Recap.TransformExpr(StartExpr));
+    Expr *NewStop = AssertSuccess(Recap.TransformExpr(StopExpr));
+    Expr *NewStep = AssertSuccess(Recap.TransformExpr(Step));
+
+    IntegerLiteral *Zero = IntegerLiteral::Create(
+        Ctx, llvm::APInt(Ctx.getIntWidth(LogicalTy), 0), LogicalTy, {});
+    Expr *Dist;
+    if (Rel == BO_NE) {
+      // When using a != comparison, the increment can be +1 or -1. This can be
+      // dynamic at runtime, so we need to check for the direction.
+      Expr *IsNegStep =
+          AssertSuccess(Actions.BuildBinOp(nullptr, {}, BO_LT, NewStep, Zero));
+
+      // Positive increment.
+      Expr *ForwardRange = AssertSuccess(
+          Actions.BuildBinOp(nullptr, {}, BO_Sub, NewStop, NewStart));
+      ForwardRange = AssertSuccess(
+          Actions.BuildCStyleCastExpr({}, LogicalTSI, {}, ForwardRange));
+      Expr *ForwardDist = AssertSuccess(
+          Actions.BuildBinOp(nullptr, {}, BO_Div, ForwardRange, NewStep));
+
+      // Negative increment.
+      Expr *BackwardRange = AssertSuccess(
+          Actions.BuildBinOp(nullptr, {}, BO_Sub, NewStart, NewStop));
+      BackwardRange = AssertSuccess(
+          Actions.BuildCStyleCastExpr({}, LogicalTSI, {}, BackwardRange));
+      Expr *NegIncAmount =
+          AssertSuccess(Actions.BuildUnaryOp(nullptr, {}, UO_Minus, NewStep));
+      Expr *BackwardDist = AssertSuccess(
+          Actions.BuildBinOp(nullptr, {}, BO_Div, BackwardRange, NegIncAmount));
+
+      // Use the appropriate case.
+      Dist = AssertSuccess(Actions.ActOnConditionalOp(
+          {}, {}, IsNegStep, BackwardDist, ForwardDist));
+    } else {
+      assert((Rel == BO_LT || Rel == BO_LE || Rel == BO_GE || Rel == BO_GT) &&
+             "Expected one of these relational operators");
+
+      // We can derive the direction from any other comparison operator. The
+      // program is not well-formed OpenMP if Step increments/decrements in the
+      // other direction. First, check whether at least the first iteration
+      // passes the loop condition.
+      Expr *HasAnyIteration = AssertSuccess(
+          Actions.BuildBinOp(nullptr, {}, Rel, NewStart, NewStop));
+
+      // Compute the range between first and last counter value.
+      Expr *Range;
+      if (Rel == BO_GE || Rel == BO_GT)
+        Range = AssertSuccess(
+            Actions.BuildBinOp(nullptr, {}, BO_Sub, NewStart, NewStop));
+      else
+        Range = AssertSuccess(
+            Actions.BuildBinOp(nullptr, {}, BO_Sub, NewStop, NewStart));
+
+      // Ensure unsigned range space.
+      Range =
+          AssertSuccess(Actions.BuildCStyleCastExpr({}, LogicalTSI, {}, Range));
+
+      if (Rel == BO_LE || Rel == BO_GE) {
+        // Add one to the range if the relational operator is inclusive.
+        Range =
+            AssertSuccess(Actions.BuildUnaryOp(nullptr, {}, UO_PreInc, Range));
+      }
+
+      // Divide by the absolute step amount.
+      if (Rel == BO_GE || Rel == BO_GT)
+        NewStep =
+            AssertSuccess(Actions.BuildUnaryOp(nullptr, {}, UO_Minus, NewStep));
+      Dist = AssertSuccess(
+          Actions.BuildBinOp(nullptr, {}, BO_Div, Range, NewStep));
+
+      // If there is not at least one iteration, the range contains garbage. Fix
+      // to zero in this case.
+      Dist = AssertSuccess(
+          Actions.ActOnConditionalOp({}, {}, HasAnyIteration, Dist, Zero));
+    }
+
+    // Assign the result to the out-parameter.
+    Body = AssertSuccess(Actions.BuildBinOp(Actions.getCurScope(), {},
+                                            BO_Assign, DistRef, Dist));
+  }
+  return cast<CapturedStmt>(
+      AssertSuccess(Actions.ActOnCapturedRegionEnd(Body)));
+}
+
+/// Create a closure that computes the loop variable from the logical iteration
+/// number.
+///
+/// \param Actions   The Sema object.
+/// \param LoopVarTy Type for the loop variable used for result value.
+/// \param LogicalTy Type for the logical iteration number.
+/// \param StartExpr Value of the loop counter at the first iteration.
+/// \param Step      Amount of increment after each iteration.
+/// \param Deref     Whether the loop variable is a dereference of the loop
+/// counter variable.
+///
+/// \return Closure (CapturedStmt) of the loop value calculation.
+static CapturedStmt *buildLoopVarFunc(Sema &Actions, QualType LoopVarTy,
+                                      QualType LogicalTy,
+                                      DeclRefExpr *StartExpr, Expr *Step,
+                                      bool Deref) {
+  ASTContext &Ctx = Actions.getASTContext();
+
+  // Pass the result as an out-parameter. Passing as return value would require
+  // the OpenMPIRBuilder to know additional C/C++ semantics, such as how to
+  // invoke a copy constructor.
+  QualType TargetParamTy = Ctx.getLValueReferenceType(LoopVarTy);
+  Sema::CapturedParamNameType Params[] = {{"LoopVar", TargetParamTy},
+                                          {"Logical", LogicalTy},
+                                          {StringRef(), QualType()}};
+  Actions.ActOnCapturedRegionStart({}, nullptr, CR_Default, Params);
+
+  // Capture the initial iterator, which represents the LoopVar value at
+  // logical iteration zero. Since the original ForStmt/CXXForRangeStmt updates
+  // it in every iteration, capture it by value before it is modified.
+  VarDecl *StartVar = cast<VarDecl>(StartExpr->getDecl());
+  bool Invalid = Actions.tryCaptureVariable(StartVar, {},
+                                            Sema::TryCapture_ExplicitByVal, {});
+  (void)Invalid;
+  assert(!Invalid && "Expecting capture-by-value to work.");
+
+  Expr *Body;
+  {
+    Sema::CompoundScopeRAII CompoundScope(Actions);
+    auto *CS = cast<CapturedDecl>(Actions.CurContext);
+
+    ImplicitParamDecl *TargetParam = CS->getParam(0);
+    DeclRefExpr *TargetRef = Actions.BuildDeclRefExpr(
+        TargetParam, LoopVarTy, VK_LValue, {}, nullptr, nullptr, {}, nullptr);
+    ImplicitParamDecl *IndvarParam = CS->getParam(1);
+    DeclRefExpr *LogicalRef = Actions.BuildDeclRefExpr(
+        IndvarParam, LogicalTy, VK_LValue, {}, nullptr, nullptr, {}, nullptr);
+
+    // Capture the Start expression.
+    CaptureVars Recap(Actions);
+    Expr *NewStart = AssertSuccess(Recap.TransformExpr(StartExpr));
+    Expr *NewStep = AssertSuccess(Recap.TransformExpr(Step));
+
+    Expr *Skip = AssertSuccess(
+        Actions.BuildBinOp(nullptr, {}, BO_Mul, NewStep, LogicalRef));
+    // TODO: Explicitly cast to the iterator's difference_type instead of
+    // relying on implicit conversion.
+    Expr *Advanced =
+        AssertSuccess(Actions.BuildBinOp(nullptr, {}, BO_Add, NewStart, Skip));
+
+    if (Deref) {
+      // For range-based for-loops convert the loop counter value to a concrete
+      // loop variable value by dereferencing the iterator.
+      Advanced =
+          AssertSuccess(Actions.BuildUnaryOp(nullptr, {}, UO_Deref, Advanced));
+    }
+
+    // Assign the result to the output parameter.
+    Body = AssertSuccess(Actions.BuildBinOp(Actions.getCurScope(), {},
+                                            BO_Assign, TargetRef, Advanced));
+  }
+  return cast<CapturedStmt>(
+      AssertSuccess(Actions.ActOnCapturedRegionEnd(Body)));
+}
+
+StmtResult Sema::ActOnOpenMPCanonicalLoop(Stmt *AStmt) {
+  ASTContext &Ctx = getASTContext();
+
+  // Extract the common elements of ForStmt and CXXForRangeStmt:
+  // Loop variable, repeat condition, increment
+  Expr *Cond, *Inc;
+  VarDecl *CounterDecl, *LVDecl;
+  if (auto *For = dyn_cast<ForStmt>(AStmt)) {
+    Stmt *Init = For->getInit();
+    if (auto *LCVarDeclStmt = dyn_cast<DeclStmt>(Init)) {
+      // For statement declares loop variable.
+      CounterDecl = cast<VarDecl>(LCVarDeclStmt->getSingleDecl());
+    } else if (auto *LCAssign = cast<BinaryOperator>(Init)) {
+      // For statement reuses variable.
+      assert(LCAssign->getOpcode() == BO_Assign &&
+             "init part must be a loop variable assignment");
+      auto *CounterRef = cast<DeclRefExpr>(LCAssign->getLHS());
+      CounterDecl = cast<VarDecl>(CounterRef->getDecl());
+    } else
+      llvm_unreachable("Cannot determine loop variable");
+    LVDecl = CounterDecl;
+
+    Cond = For->getCond();
+    Inc = For->getInc();
+  } else if (auto *RangeFor = dyn_cast<CXXForRangeStmt>(AStmt)) {
+    DeclStmt *BeginStmt = RangeFor->getBeginStmt();
+    CounterDecl = cast<VarDecl>(BeginStmt->getSingleDecl());
+    LVDecl = RangeFor->getLoopVariable();
+
+    Cond = RangeFor->getCond();
+    Inc = RangeFor->getInc();
+  } else
+    llvm_unreachable("unhandled kind of loop");
+
+  QualType CounterTy = CounterDecl->getType();
+  QualType LVTy = LVDecl->getType();
+
+  // Analyze the loop condition.
+  Expr *LHS, *RHS;
+  BinaryOperator::Opcode CondRel;
+  Cond = Cond->IgnoreImplicit();
+  if (auto *CondBinExpr = dyn_cast<BinaryOperator>(Cond)) {
+    LHS = CondBinExpr->getLHS();
+    RHS = CondBinExpr->getRHS();
+    CondRel = CondBinExpr->getOpcode();
+  } else if (auto *CondCXXOp = dyn_cast<CXXOperatorCallExpr>(Cond)) {
+    assert(CondCXXOp->getOperator() == OO_ExclaimEqual &&
+           "Expected != loop condition for iterator-based loops");
+    assert(CondCXXOp->getNumArgs() == 2 && "Comparison should have 2 operands");
+    LHS = CondCXXOp->getArg(0);
+    RHS = CondCXXOp->getArg(1);
+    CondRel = BO_NE;
+  } else
+    llvm_unreachable("unexpected loop condition");
+
+  // Normalize such that the loop counter is on the LHS.
+  if (!isa<DeclRefExpr>(LHS->IgnoreImplicit()) ||
+      cast<DeclRefExpr>(LHS->IgnoreImplicit())->getDecl() != CounterDecl) {
+    std::swap(LHS, RHS);
+    CondRel = BinaryOperator::reverseComparisonOp(CondRel);
+  }
+  auto *CounterRef = cast<DeclRefExpr>(LHS->IgnoreImplicit());
+
+  // Decide the bit width for the logical iteration counter. By default use the
+  // unsigned ptrdiff_t integer size (for iterators and pointers).
+  // TODO: For iterators, use iterator::difference_type,
+  // std::iterator_traits<>::difference_type or decltype(it - end).
+  QualType LogicalTy = Ctx.getUnsignedPointerDiffType();
+  if (CounterTy->isIntegerType()) {
+    unsigned BitWidth = Ctx.getIntWidth(CounterTy);
+    LogicalTy = Ctx.getIntTypeForBitwidth(BitWidth, false);
+  }
+
+  // Analyze the loop increment.
+  Expr *Step;
+  if (auto *IncUn = dyn_cast<UnaryOperator>(Inc)) {
+    int Direction;
+    switch (IncUn->getOpcode()) {
+    case UO_PreInc:
+    case UO_PostInc:
+      Direction = 1;
+      break;
+    case UO_PreDec:
+    case UO_PostDec:
+      Direction = -1;
+      break;
+    default:
+      llvm_unreachable("unhandled unary increment operator");
+    }
+    Step = IntegerLiteral::Create(
+        Ctx, llvm::APInt(Ctx.getIntWidth(LogicalTy), Direction), LogicalTy, {});
+  } else if (auto *IncBin = dyn_cast<BinaryOperator>(Inc)) {
+    if (IncBin->getOpcode() == BO_AddAssign) {
+      Step = IncBin->getRHS();
+    } else if (IncBin->getOpcode() == BO_SubAssign) {
+      Step =
+          AssertSuccess(BuildUnaryOp(nullptr, {}, UO_Minus, IncBin->getRHS()));
+    } else
+      llvm_unreachable("unhandled binary increment operator");
+  } else if (auto *CondCXXOp = dyn_cast<CXXOperatorCallExpr>(Inc)) {
+    switch (CondCXXOp->getOperator()) {
+    case OO_PlusPlus:
+      Step = IntegerLiteral::Create(
+          Ctx, llvm::APInt(Ctx.getIntWidth(LogicalTy), 1), LogicalTy, {});
+      break;
+    case OO_MinusMinus:
+      Step = IntegerLiteral::Create(
+          Ctx, llvm::APInt(Ctx.getIntWidth(LogicalTy), -1), LogicalTy, {});
+      break;
+    case OO_PlusEqual:
+      Step = CondCXXOp->getArg(1);
+      break;
+    case OO_MinusEqual:
+      Step = AssertSuccess(
+          BuildUnaryOp(nullptr, {}, UO_Minus, CondCXXOp->getArg(1)));
+      break;
+    default:
+      llvm_unreachable("unhandled overloaded increment operator");
+    }
+  } else
+    llvm_unreachable("unknown increment expression");
+
+  CapturedStmt *DistanceFunc =
+      buildDistanceFunc(*this, LogicalTy, CondRel, LHS, RHS, Step);
+  CapturedStmt *LoopVarFunc = buildLoopVarFunc(
+      *this, LVTy, LogicalTy, CounterRef, Step, isa<CXXForRangeStmt>(AStmt));
+  DeclRefExpr *LVRef = BuildDeclRefExpr(LVDecl, LVDecl->getType(), VK_LValue,
+                                        {}, nullptr, nullptr, {}, nullptr);
+  return OMPCanonicalLoop::create(getASTContext(), AStmt, DistanceFunc,
+                                  LoopVarFunc, LVRef);
+}
+
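
To make the result of this analysis concrete, a rough C++ sketch (all names
invented here, not taken from the patch) of what the two generated closures
conceptually compute for a plain integer loop such as
"for (int i = 3; i < N; i += 2)":

  void runNormalized(int N) {
    // Stand-in for the distance CapturedStmt: the iteration count is known
    // before entering the loop, here ceil((N - 3) / 2).
    auto DistanceFn = [&](unsigned &Result) {
      Result = N > 3 ? (unsigned)(N - 3 + 2 - 1) / 2 : 0;
    };
    // Stand-in for the loop-variable CapturedStmt: map a logical iteration
    // number back to the value of the loop variable.
    auto LoopVarFn = [](int &Result, unsigned Logical) {
      Result = 3 + (int)Logical * 2;
    };
    unsigned Count;
    DistanceFn(Count);
    for (unsigned Logical = 0; Logical < Count; ++Logical) {
      int i;
      LoopVarFn(i, Logical);
      // the loop body uses i exactly as the original loop would
    }
  }
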
 StmtResult Sema::ActOnOpenMPExecutableDirective(
     OpenMPDirectiveKind Kind, const DeclarationNameInfo &DirName,
     OpenMPDirectiveKind CancelRegion, ArrayRef<OMPClause *> Clauses,
@@ -7953,6 +8300,8 @@
   // OpenMP [2.9.1, Canonical Loop Form]
   //   for (init-expr; test-expr; incr-expr) structured-block
   //   for (range-decl: range-expr) structured-block
+  if (auto *CanonLoop = dyn_cast_or_null<OMPCanonicalLoop>(S))
+    S = CanonLoop->getLoopStmt();
   auto *For = dyn_cast_or_null<ForStmt>(S);
   auto *CXXFor = dyn_cast_or_null<CXXForRangeStmt>(S);
   // Ranged for is supported only in OpenMP 5.0.
@@ -8286,6 +8635,17 @@
   return PostUpdate;
 }
 
+static Stmt *findLoopBody(Stmt *L) {
+  if (auto *CanonLoop = dyn_cast_or_null<OMPCanonicalLoop>(L))
+    L = CanonLoop->getLoopStmt();
+  if (auto *For = dyn_cast<ForStmt>(L))
+    return For->getBody();
+
+  assert(isa<CXXForRangeStmt>(L) &&
+         "Expected canonical for or range-based for loops.");
+  return cast<CXXForRangeStmt>(L)->getBody();
+}
+
 /// Called on a for stmt to check itself and nested loops (if any).
 /// \return Returns 0 if one of the collapsed stmts is not canonical for loop,
 /// number of collapsed loops otherwise.
@@ -8346,13 +8706,7 @@
     // All loops associated with the construct must be perfectly nested; that
     // is, there must be no intervening code nor any OpenMP directive between
     // any two loops.
-    if (auto *For = dyn_cast<ForStmt>(CurStmt)) {
-      CurStmt = For->getBody();
-    } else {
-      assert(isa<CXXForRangeStmt>(CurStmt) &&
-             "Expected canonical for or range-based for loops.");
-      CurStmt = cast<CXXForRangeStmt>(CurStmt)->getBody();
-    }
+    CurStmt = findLoopBody(CurStmt);
     CurStmt = OMPLoopDirective::tryToFindNextInnerLoop(
         CurStmt, SemaRef.LangOpts.OpenMP >= 50);
   }
@@ -8374,13 +8728,7 @@
     // All loops associated with the construct must be perfectly nested; that
     // is, there must be no intervening code nor any OpenMP directive between
     // any two loops.
-    if (auto *For = dyn_cast<ForStmt>(CurStmt)) {
-      CurStmt = For->getBody();
-    } else {
-      assert(isa<CXXForRangeStmt>(CurStmt) &&
-             "Expected canonical for or range-based for loops.");
-      CurStmt = cast<CXXForRangeStmt>(CurStmt)->getBody();
-    }
+    CurStmt = findLoopBody(CurStmt);
     CurStmt = OMPLoopDirective::tryToFindNextInnerLoop(
         CurStmt, SemaRef.LangOpts.OpenMP >= 50);
   }
Index: clang/lib/Sema/SemaExpr.cpp
===================================================================
--- clang/lib/Sema/SemaExpr.cpp
+++ clang/lib/Sema/SemaExpr.cpp
@@ -17263,18 +17263,17 @@
 
 
 /// Capture the given variable in the captured region.
-static bool captureInCapturedRegion(CapturedRegionScopeInfo *RSI,
-                                    VarDecl *Var,
-                                    SourceLocation Loc,
-                                    const bool BuildAndDiagnose,
-                                    QualType &CaptureType,
-                                    QualType &DeclRefType,
-                                    const bool RefersToCapturedVariable,
-                                    Sema &S, bool Invalid) {
+static bool captureInCapturedRegion(
+    CapturedRegionScopeInfo *RSI, VarDecl *Var, SourceLocation Loc,
+    const bool BuildAndDiagnose, QualType &CaptureType, QualType &DeclRefType,
+    const bool RefersToCapturedVariable, Sema::TryCaptureKind Kind,
+    bool IsTopScope, Sema &S, bool Invalid) {
   // By default, capture variables by reference.
   bool ByRef = true;
-  // Using an LValue reference type is consistent with Lambdas (see below).
-  if (S.getLangOpts().OpenMP && RSI->CapRegionKind == CR_OpenMP) {
+  if (IsTopScope && Kind != Sema::TryCapture_Implicit) {
+    ByRef = (Kind == Sema::TryCapture_ExplicitByRef);
+  } else if (S.getLangOpts().OpenMP && RSI->CapRegionKind == CR_OpenMP) {
+    // Using an LValue reference type is consistent with Lambdas (see below).
     if (S.isOpenMPCapturedDecl(Var)) {
       bool HasConst = DeclRefType.isConstQualified();
       DeclRefType = DeclRefType.getUnqualifiedType();
@@ -17620,9 +17619,9 @@
                                DeclRefType, Nested, *this, Invalid);
       Nested = true;
     } else if (CapturedRegionScopeInfo *RSI = dyn_cast<CapturedRegionScopeInfo>(CSI)) {
-      Invalid = !captureInCapturedRegion(RSI, Var, ExprLoc, BuildAndDiagnose,
-                                         CaptureType, DeclRefType, Nested,
-                                         *this, Invalid);
+      Invalid = !captureInCapturedRegion(
+          RSI, Var, ExprLoc, BuildAndDiagnose, CaptureType, DeclRefType, Nested,
+          Kind, /*IsTopScope*/ I == N - 1, *this, Invalid);
       Nested = true;
     } else {
       LambdaScopeInfo *LSI = cast<LambdaScopeInfo>(CSI);
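
Context for the Kind/IsTopScope change above: the loop-variable CapturedStmt
built in SemaOpenMP needs exactly one explicit by-value capture (the loop
counter's initial value) while everything else keeps the default by-reference
capture. In plain-lambda terms this corresponds to something like the sketch
below (illustrative only; the patch expresses this via CapturedStmt captures):

  #include <cstddef>
  #include <vector>

  void sketch(std::vector<int> &V) {
    auto Begin = V.begin();
    // Capture everything by reference, but Begin by value, so that advancing
    // the real iterator elsewhere does not change the captured start point.
    auto LoopVarFn = [&, Begin](std::vector<int>::iterator &Result,
                                std::size_t Logical) {
      Result = Begin + Logical;
    };
    (void)LoopVarFn;
  }
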
Index: clang/lib/Parse/ParseOpenMP.cpp
===================================================================
--- clang/lib/Parse/ParseOpenMP.cpp
+++ clang/lib/Parse/ParseOpenMP.cpp
@@ -2529,7 +2529,15 @@
       // the captured region. Code elsewhere assumes that any FunctionScopeInfo
       // should have at least one compound statement scope within it.
       ParsingOpenMPDirectiveRAII NormalScope(*this, /*Value=*/false);
-      AssociatedStmt = (Sema::CompoundScopeRAII(Actions), ParseStatement());
+      {
+        Sema::CompoundScopeRAII Scope(Actions);
+        AssociatedStmt = ParseStatement();
+
+        if (AssociatedStmt.isUsable() && isOpenMPLoopDirective(DKind) &&
+            getLangOpts().OpenMPIRBuilder)
+          AssociatedStmt =
+              Actions.ActOnOpenMPCanonicalLoop(AssociatedStmt.get());
+      }
       AssociatedStmt = Actions.ActOnOpenMPRegionEnd(AssociatedStmt, Clauses);
     } else if (DKind == OMPD_target_update || DKind == OMPD_target_enter_data ||
                DKind == OMPD_target_exit_data) {
Index: clang/lib/CodeGen/CodeGenFunction.h
===================================================================
--- clang/lib/CodeGen/CodeGenFunction.h
+++ clang/lib/CodeGen/CodeGenFunction.h
@@ -50,6 +50,7 @@
 class SwitchInst;
 class Twine;
 class Value;
+class CanonicalLoopInfo;
 }
 
 namespace clang {
@@ -276,6 +277,18 @@
   // because of jumps.
   VarBypassDetector Bypasses;
 
+  /// List of recently emitted OMPCanonicalLoops.
+  ///
+  /// Since OMPCanonicalLoops are nested inside other statements (in particular
+  /// the CapturedStmt generated by OMPExecutableDirective and non-perfectly
+  /// nested loops), we cannot call EmitOMPCanonicalLoop directly and receive
+  /// its llvm::CanonicalLoopInfo. Instead, we call EmitStmt, and any
+  /// EmitOMPCanonicalLoop invoked by it pushes its CanonicalLoopInfo onto this
+  /// stack when done. Emitting a new loop requires clearing this list first;
+  /// that means we are either starting a new loop nest or a sequential loop
+  /// that is not nested in the previous one.
+  SmallVector<llvm::CanonicalLoopInfo *, 4> OMPLoopNestStack;
+
   // CodeGen lambda for loops and support for ordered clause
   typedef llvm::function_ref<void(CodeGenFunction &, const OMPLoopDirective &,
                                   JumpDest)>
@@ -3498,6 +3511,18 @@
   static void EmitOMPTargetTeamsDistributeParallelForDeviceFunction(
       CodeGenModule &CGM, StringRef ParentName,
       const OMPTargetTeamsDistributeParallelForDirective &S);
+
+  /// Emit the Stmt \p S and return its topmost canonical loop, if any.
+  /// TODO: The \p Depth parameter is not yet implemented and must be 1. In the
+  /// future it is meant to be the number of loops in the loop nest (usually
+  /// specified by the "collapse" clause) that are collapsed into a single loop
+  /// by this function.
+  llvm::CanonicalLoopInfo *EmitOMPCollapsedCanonicalLoopNest(const Stmt *S,
+                                                             int Depth);
+
+  /// Emit an OMPCanonicalLoop using the OpenMPIRBuilder.
+  void EmitOMPCanonicalLoop(const OMPCanonicalLoop *S);
+
   /// Emit inner loop of the worksharing/simd construct.
   ///
   /// \param S Directive, for which the inner loop must be emitted.
Index: clang/lib/CodeGen/CGStmtOpenMP.cpp
===================================================================
--- clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -163,6 +163,8 @@
     for (unsigned Cnt = 0; Cnt < S.getCollapsedNumber(); ++Cnt) {
       Body = OMPLoopDirective::tryToFindNextInnerLoop(
           Body, /*TryImperfectlyNestedLoops=*/true);
+      if (auto *CanonLoop = dyn_cast<OMPCanonicalLoop>(Body))
+        Body = CanonLoop->getLoopStmt();
       if (auto *For = dyn_cast<ForStmt>(Body)) {
         Body = For->getBody();
       } else {
@@ -1771,6 +1773,8 @@
     return;
   }
   if (SimplifiedS == NextLoop) {
+    if (auto *CanonLoop = dyn_cast<OMPCanonicalLoop>(SimplifiedS))
+      SimplifiedS = CanonLoop->getLoopStmt();
     if (const auto *For = dyn_cast<ForStmt>(SimplifiedS)) {
       S = For->getBody();
     } else {
@@ -1858,6 +1862,117 @@
   BreakContinueStack.pop_back();
 }
 
+using EmittedClosureTy = std::pair<llvm::Function *, llvm::Value *>;
+
+/// Emit a captured statement and return the function as well as its captured
+/// closure context.
+static EmittedClosureTy emitCapturedStmtFunc(CodeGenFunction &ParentCGF,
+                                             const CapturedStmt *S) {
+  LValue CapStruct = ParentCGF.InitCapturedStruct(*S);
+  CodeGenFunction CGF(ParentCGF.CGM, true);
+  std::unique_ptr<CodeGenFunction::CGCapturedStmtInfo> CSI =
+      std::make_unique<CodeGenFunction::CGCapturedStmtInfo>(*S);
+  CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, CSI.get());
+  llvm::Function *F = CGF.GenerateCapturedStmtFunction(*S);
+
+  return {F, CapStruct.getPointer(ParentCGF)};
+}
+
+/// Emit a call to a previously captured closure.
+static llvm::CallInst *
+emitCapturedStmtCall(CodeGenFunction &ParentCGF, EmittedClosureTy Cap,
+                     llvm::ArrayRef<llvm::Value *> Args) {
+  // Append the closure context to the argument.
+  SmallVector<llvm::Value *> EffectiveArgs;
+  EffectiveArgs.reserve(Args.size() + 1);
+  llvm::append_range(EffectiveArgs, Args);
+  EffectiveArgs.push_back(Cap.second);
+
+  return ParentCGF.Builder.CreateCall(Cap.first, EffectiveArgs);
+}
+
+llvm::CanonicalLoopInfo *
+CodeGenFunction::EmitOMPCollapsedCanonicalLoopNest(const Stmt *S, int Depth) {
+  assert(Depth == 1 && "Nested loops with OpenMPIRBuilder not yet implemented");
+
+  EmitStmt(S);
+
+  assert(OMPLoopNestStack.size() >= (size_t)Depth && "Found too few loops");
+  return OMPLoopNestStack.front();
+}
+
+void CodeGenFunction::EmitOMPCanonicalLoop(const OMPCanonicalLoop *S) {
+  const Stmt *SyntacticalLoop = S->getLoopStmt();
+  if (!getLangOpts().OpenMPIRBuilder) {
+    // Without the OpenMPIRBuilder, just emit the wrapped loop normally.
+    EmitStmt(SyntacticalLoop);
+    return;
+  }
+
+  LexicalScope ForScope(*this, S->getSourceRange());
+
+  // Emit the loop's init statements; the distance and loop-variable functions
+  // may reference variables declared by them.
+  const Stmt *BodyStmt;
+  if (auto *For = dyn_cast<ForStmt>(SyntacticalLoop)) {
+    if (const Stmt *InitStmt = For->getInit())
+      EmitStmt(InitStmt);
+    BodyStmt = For->getBody();
+  } else if (auto *RangeFor = dyn_cast<CXXForRangeStmt>(SyntacticalLoop)) {
+    if (const DeclStmt *RangeStmt = RangeFor->getRangeStmt())
+      EmitStmt(RangeStmt);
+    if (const DeclStmt *BeginStmt = RangeFor->getBeginStmt())
+      EmitStmt(BeginStmt);
+    if (const DeclStmt *EndStmt = RangeFor->getEndStmt())
+      EmitStmt(EndStmt);
+    if (const DeclStmt *LoopVarStmt = RangeFor->getLoopVarStmt())
+      EmitStmt(LoopVarStmt);
+    BodyStmt = RangeFor->getBody();
+  } else
+    llvm_unreachable("Expected for-stmt or range-based for-stmt");
+
+  // Emit closure for later use. By-value captures will be captured here.
+  const CapturedStmt *DistanceFunc = S->getDistanceFunc();
+  EmittedClosureTy DistanceClosure = emitCapturedStmtFunc(*this, DistanceFunc);
+  const CapturedStmt *LoopVarFunc = S->getLoopVarFunc();
+  EmittedClosureTy LoopVarClosure = emitCapturedStmtFunc(*this, LoopVarFunc);
+
+  // Call the distance function to get the number of iterations of the loop to
+  // come.
+  QualType LogicalTy = DistanceFunc->getCapturedDecl()
+                           ->getParam(0)
+                           ->getType()
+                           .getNonReferenceType();
+  Address CountAddr = CreateMemTemp(LogicalTy, ".count.addr");
+  emitCapturedStmtCall(*this, DistanceClosure, {CountAddr.getPointer()});
+  llvm::Value *DistVal = Builder.CreateLoad(CountAddr, ".count");
+
+  // Emit the loop structure.
+  llvm::OpenMPIRBuilder &OMPBuilder = CGM.getOpenMPRuntime().getOMPBuilder();
+  llvm::CanonicalLoopInfo *CL =
+      OMPBuilder.createCanonicalLoop(Builder, {}, DistVal);
+
+  // Emit the loop body: Convert the logical iteration number to the loop
+  // variable and emit the body.
+  Builder.restoreIP(CL->getBodyIP());
+  const DeclRefExpr *LoopVarRef = S->getLoopVarRef();
+  LValue LCVal = EmitLValue(LoopVarRef);
+  Address LoopVarAddress = LCVal.getAddress(*this);
+  emitCapturedStmtCall(*this, LoopVarClosure,
+                       {LoopVarAddress.getPointer(), CL->getIndVar()});
+  {
+    RunCleanupsScope BodyScope(*this);
+    EmitStmt(BodyStmt);
+  }
+
+  // Finish up the loop.
+  Builder.restoreIP(CL->getAfterIP());
+  ForScope.ForceCleanup();
+
+  // Remember the CanonicalLoopInfo for parent AST nodes consuming it.
+  OMPLoopNestStack.push_back(CL);
+}
+
 void CodeGenFunction::EmitOMPInnerLoop(
     const OMPExecutableDirective &S, bool RequiresCleanup, const Expr *LoopCond,
     const Expr *IncExpr,
@@ -1875,6 +1990,7 @@
   const CapturedStmt *ICS = OMPED.getInnermostCapturedStmt();
   const Stmt *SS = ICS->getCapturedStmt();
   const AttributedStmt *AS = dyn_cast_or_null<AttributedStmt>(SS);
+  OMPLoopNestStack.clear();
   if (AS)
     LoopStack.push(CondBlock, CGM.getContext(), CGM.getCodeGenOpts(),
                    AS->getAttrs(), SourceLocToDebugLoc(R.getBegin()),
@@ -2424,6 +2540,7 @@
   llvm::BasicBlock *CondBlock = createBasicBlock("omp.dispatch.cond");
   EmitBlock(CondBlock);
   const SourceRange R = S.getSourceRange();
+  OMPLoopNestStack.clear();
   LoopStack.push(CondBlock, SourceLocToDebugLoc(R.getBegin()),
                  SourceLocToDebugLoc(R.getEnd()));
 
@@ -2507,6 +2624,7 @@
   }
 
   EmitBranch(CondBlock);
+  OMPLoopNestStack.clear();
   LoopStack.pop();
   // Emit the fall-through block.
   EmitBlock(LoopExit.getBlock());
@@ -3351,8 +3469,24 @@
 
 void CodeGenFunction::EmitOMPForDirective(const OMPForDirective &S) {
   bool HasLastprivates = false;
-  auto &&CodeGen = [&S, &HasLastprivates](CodeGenFunction &CGF,
-                                          PrePostActionTy &) {
+  auto &&CodeGen = [this, &S, &HasLastprivates](CodeGenFunction &CGF,
+                                                PrePostActionTy &) {
+    // Use the OpenMPIRBuilder if enabled.
+    if (CGM.getLangOpts().OpenMPIRBuilder) {
+      // Emit the associated statement and get its loop representation.
+      const Stmt *Inner = S.getRawStmt();
+      llvm::CanonicalLoopInfo *CLI =
+          EmitOMPCollapsedCanonicalLoopNest(Inner, 1);
+
+      bool NeedsBarrier = !S.getSingleClause<OMPNowaitClause>();
+      llvm::OpenMPIRBuilder &OMPBuilder =
+          CGM.getOpenMPRuntime().getOMPBuilder();
+      llvm::OpenMPIRBuilder::InsertPointTy AllocaIP(
+          AllocaInsertPt->getParent(), AllocaInsertPt->getIterator());
+      OMPBuilder.createStaticWorkshareLoop(Builder, CLI, AllocaIP,
+                                           NeedsBarrier);
+      return;
+    }
+
     HasLastprivates = emitWorksharingDirective(CGF, S, S.hasCancel());
   };
   {
@@ -3363,9 +3497,11 @@
                                                 S.hasCancel());
   }
 
-  // Emit an implicit barrier at the end.
-  if (!S.getSingleClause<OMPNowaitClause>() || HasLastprivates)
-    CGM.getOpenMPRuntime().emitBarrierCall(*this, S.getBeginLoc(), OMPD_for);
+  if (!CGM.getLangOpts().OpenMPIRBuilder) {
+    // Emit an implicit barrier at the end.
+    if (!S.getSingleClause<OMPNowaitClause>() || HasLastprivates)
+      CGM.getOpenMPRuntime().emitBarrierCall(*this, S.getBeginLoc(), OMPD_for);
+  }
   // Check for outer lastprivate conditional update.
   checkForLastprivateConditionalUpdate(*this, S);
 }
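
For orientation, this is the kind of source the new OpenMPIRBuilder path is
meant to handle end-to-end within this patch's current restrictions; enabling
the builder via a flag such as -fopenmp-enable-irbuilder is assumed here:

  void saxpy(int n, float a, const float *x, float *y) {
  #pragma omp parallel
    {
  #pragma omp for nowait
      for (int i = 0; i < n; ++i)
        y[i] = a * x[i] + y[i];
    }
  }
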
Index: clang/lib/CodeGen/CGStmt.cpp
===================================================================
--- clang/lib/CodeGen/CGStmt.cpp
+++ clang/lib/CodeGen/CGStmt.cpp
@@ -194,6 +194,9 @@
   case Stmt::SEHTryStmtClass:
     EmitSEHTryStmt(cast<SEHTryStmt>(*S));
     break;
+  case Stmt::OMPCanonicalLoopClass:
+    EmitOMPCanonicalLoop(cast<OMPCanonicalLoop>(S));
+    break;
   case Stmt::OMPParallelDirectiveClass:
     EmitOMPParallelDirective(cast<OMPParallelDirective>(*S));
     break;
Index: clang/lib/AST/StmtProfile.cpp
===================================================================
--- clang/lib/AST/StmtProfile.cpp
+++ clang/lib/AST/StmtProfile.cpp
@@ -848,6 +848,10 @@
       P.Visit(*I);
 }
 
+void StmtProfiler::VisitOMPCanonicalLoop(const OMPCanonicalLoop *L) {
+  VisitStmt(L);
+}
+
 void StmtProfiler::VisitOMPLoopDirective(const OMPLoopDirective *S) {
   VisitOMPExecutableDirective(S);
 }
Index: clang/lib/AST/StmtPrinter.cpp
===================================================================
--- clang/lib/AST/StmtPrinter.cpp
+++ clang/lib/AST/StmtPrinter.cpp
@@ -636,6 +636,10 @@
 //  OpenMP directives printing methods
 //===----------------------------------------------------------------------===//
 
+void StmtPrinter::VisitOMPCanonicalLoop(OMPCanonicalLoop *Node) {
+  PrintStmt(Node->getLoopStmt());
+}
+
 void StmtPrinter::PrintOMPExecutableDirective(OMPExecutableDirective *S,
                                               bool ForceNoStmt) {
   OMPClausePrinter Printer(OS, Policy);
Index: clang/lib/AST/StmtOpenMP.cpp
===================================================================
--- clang/lib/AST/StmtOpenMP.cpp
+++ clang/lib/AST/StmtOpenMP.cpp
@@ -91,6 +91,8 @@
         for (Stmt *S : CS->body()) {
           if (!S)
             continue;
+          if (auto *CanonLoop = dyn_cast<OMPCanonicalLoop>(S))
+            S = CanonLoop->getLoopStmt();
           if (isa<ForStmt>(S) || isa<CXXForRangeStmt>(S)) {
             // Only single loop construct is allowed.
             if (CurStmt) {
@@ -121,6 +123,8 @@
 Stmt *OMPLoopDirective::getBody() {
   // This relies on the loop form is already checked by Sema.
   Stmt *Body = Data->getRawStmt()->IgnoreContainers();
+  if (auto *CanonLoop = dyn_cast<OMPCanonicalLoop>(Body))
+    Body = CanonLoop->getLoopStmt();
   if (auto *For = dyn_cast<ForStmt>(Body)) {
     Body = For->getBody();
   } else {
@@ -130,6 +134,8 @@
   }
   for (unsigned Cnt = 1; Cnt < CollapsedNum; ++Cnt) {
     Body = tryToFindNextInnerLoop(Body, /*TryImperfectlyNestedLoops=*/true);
+    if (auto *CanonLoop = dyn_cast<OMPCanonicalLoop>(Body))
+      Body = CanonLoop->getLoopStmt();
     if (auto *For = dyn_cast<ForStmt>(Body)) {
       Body = For->getBody();
     } else {
Index: clang/lib/AST/Stmt.cpp
===================================================================
--- clang/lib/AST/Stmt.cpp
+++ clang/lib/AST/Stmt.cpp
@@ -1266,13 +1266,6 @@
     break;
   case VCK_ByCopy:
     assert(Var && "capturing by copy must have a variable!");
-    assert(
-        (Var->getType()->isScalarType() || (Var->getType()->isReferenceType() &&
-                                            Var->getType()
-                                                ->castAs<ReferenceType>()
-                                                ->getPointeeType()
-                                                ->isScalarType())) &&
-        "captures by copy are expected to have a scalar type!");
     break;
   case VCK_VLAType:
     assert(!Var &&
Index: clang/include/clang/Serialization/ASTBitCodes.h
===================================================================
--- clang/include/clang/Serialization/ASTBitCodes.h
+++ clang/include/clang/Serialization/ASTBitCodes.h
@@ -1830,21 +1830,21 @@
       /// A CXXBoolLiteralExpr record.
       EXPR_CXX_BOOL_LITERAL,
 
-      EXPR_CXX_NULL_PTR_LITERAL,  // CXXNullPtrLiteralExpr
-      EXPR_CXX_TYPEID_EXPR,       // CXXTypeidExpr (of expr).
-      EXPR_CXX_TYPEID_TYPE,       // CXXTypeidExpr (of type).
-      EXPR_CXX_THIS,              // CXXThisExpr
-      EXPR_CXX_THROW,             // CXXThrowExpr
-      EXPR_CXX_DEFAULT_ARG,       // CXXDefaultArgExpr
-      EXPR_CXX_DEFAULT_INIT,      // CXXDefaultInitExpr
-      EXPR_CXX_BIND_TEMPORARY,    // CXXBindTemporaryExpr
+      EXPR_CXX_NULL_PTR_LITERAL, // CXXNullPtrLiteralExpr
+      EXPR_CXX_TYPEID_EXPR,      // CXXTypeidExpr (of expr).
+      EXPR_CXX_TYPEID_TYPE,      // CXXTypeidExpr (of type).
+      EXPR_CXX_THIS,             // CXXThisExpr
+      EXPR_CXX_THROW,            // CXXThrowExpr
+      EXPR_CXX_DEFAULT_ARG,      // CXXDefaultArgExpr
+      EXPR_CXX_DEFAULT_INIT,     // CXXDefaultInitExpr
+      EXPR_CXX_BIND_TEMPORARY,   // CXXBindTemporaryExpr
 
       EXPR_CXX_SCALAR_VALUE_INIT, // CXXScalarValueInitExpr
       EXPR_CXX_NEW,               // CXXNewExpr
       EXPR_CXX_DELETE,            // CXXDeleteExpr
       EXPR_CXX_PSEUDO_DESTRUCTOR, // CXXPseudoDestructorExpr
 
-      EXPR_EXPR_WITH_CLEANUPS,    // ExprWithCleanups
+      EXPR_EXPR_WITH_CLEANUPS, // ExprWithCleanups
 
       EXPR_CXX_DEPENDENT_SCOPE_MEMBER,   // CXXDependentScopeMemberExpr
       EXPR_CXX_DEPENDENT_SCOPE_DECL_REF, // DependentScopeDeclRefExpr
@@ -1852,41 +1852,42 @@
       EXPR_CXX_UNRESOLVED_MEMBER,        // UnresolvedMemberExpr
       EXPR_CXX_UNRESOLVED_LOOKUP,        // UnresolvedLookupExpr
 
-      EXPR_CXX_EXPRESSION_TRAIT,  // ExpressionTraitExpr
-      EXPR_CXX_NOEXCEPT,          // CXXNoexceptExpr
+      EXPR_CXX_EXPRESSION_TRAIT, // ExpressionTraitExpr
+      EXPR_CXX_NOEXCEPT,         // CXXNoexceptExpr
 
-      EXPR_OPAQUE_VALUE,          // OpaqueValueExpr
-      EXPR_BINARY_CONDITIONAL_OPERATOR,  // BinaryConditionalOperator
-      EXPR_TYPE_TRAIT,            // TypeTraitExpr
-      EXPR_ARRAY_TYPE_TRAIT,      // ArrayTypeTraitIntExpr
+      EXPR_OPAQUE_VALUE,                // OpaqueValueExpr
+      EXPR_BINARY_CONDITIONAL_OPERATOR, // BinaryConditionalOperator
+      EXPR_TYPE_TRAIT,                  // TypeTraitExpr
+      EXPR_ARRAY_TYPE_TRAIT,            // ArrayTypeTraitIntExpr
 
-      EXPR_PACK_EXPANSION,        // PackExpansionExpr
-      EXPR_SIZEOF_PACK,           // SizeOfPackExpr
-      EXPR_SUBST_NON_TYPE_TEMPLATE_PARM, // SubstNonTypeTemplateParmExpr
-      EXPR_SUBST_NON_TYPE_TEMPLATE_PARM_PACK,// SubstNonTypeTemplateParmPackExpr
-      EXPR_FUNCTION_PARM_PACK,    // FunctionParmPackExpr
-      EXPR_MATERIALIZE_TEMPORARY, // MaterializeTemporaryExpr
-      EXPR_CXX_FOLD,              // CXXFoldExpr
-      EXPR_CONCEPT_SPECIALIZATION,// ConceptSpecializationExpr
-      EXPR_REQUIRES,              // RequiresExpr
+      EXPR_PACK_EXPANSION,                    // PackExpansionExpr
+      EXPR_SIZEOF_PACK,                       // SizeOfPackExpr
+      EXPR_SUBST_NON_TYPE_TEMPLATE_PARM,      // SubstNonTypeTemplateParmExpr
+      EXPR_SUBST_NON_TYPE_TEMPLATE_PARM_PACK, // SubstNonTypeTemplateParmPackExpr
+      EXPR_FUNCTION_PARM_PACK,                // FunctionParmPackExpr
+      EXPR_MATERIALIZE_TEMPORARY,             // MaterializeTemporaryExpr
+      EXPR_CXX_FOLD,                          // CXXFoldExpr
+      EXPR_CONCEPT_SPECIALIZATION,            // ConceptSpecializationExpr
+      EXPR_REQUIRES,                          // RequiresExpr
 
       // CUDA
-      EXPR_CUDA_KERNEL_CALL,       // CUDAKernelCallExpr
+      EXPR_CUDA_KERNEL_CALL, // CUDAKernelCallExpr
 
       // OpenCL
-      EXPR_ASTYPE,                 // AsTypeExpr
+      EXPR_ASTYPE, // AsTypeExpr
 
       // Microsoft
-      EXPR_CXX_PROPERTY_REF_EXPR, // MSPropertyRefExpr
+      EXPR_CXX_PROPERTY_REF_EXPR,       // MSPropertyRefExpr
       EXPR_CXX_PROPERTY_SUBSCRIPT_EXPR, // MSPropertySubscriptExpr
-      EXPR_CXX_UUIDOF_EXPR,       // CXXUuidofExpr (of expr).
-      EXPR_CXX_UUIDOF_TYPE,       // CXXUuidofExpr (of type).
-      STMT_SEH_LEAVE,             // SEHLeaveStmt
-      STMT_SEH_EXCEPT,            // SEHExceptStmt
-      STMT_SEH_FINALLY,           // SEHFinallyStmt
-      STMT_SEH_TRY,               // SEHTryStmt
+      EXPR_CXX_UUIDOF_EXPR,             // CXXUuidofExpr (of expr).
+      EXPR_CXX_UUIDOF_TYPE,             // CXXUuidofExpr (of type).
+      STMT_SEH_LEAVE,                   // SEHLeaveStmt
+      STMT_SEH_EXCEPT,                  // SEHExceptStmt
+      STMT_SEH_FINALLY,                 // SEHFinallyStmt
+      STMT_SEH_TRY,                     // SEHTryStmt
 
       // OpenMP directives
+      STMT_OMP_CANONICAL_LOOP,
       STMT_OMP_PARALLEL_DIRECTIVE,
       STMT_OMP_SIMD_DIRECTIVE,
       STMT_OMP_FOR_DIRECTIVE,
@@ -1946,10 +1947,10 @@
       EXPR_OMP_ITERATOR,
 
       // ARC
-      EXPR_OBJC_BRIDGED_CAST,     // ObjCBridgedCastExpr
+      EXPR_OBJC_BRIDGED_CAST, // ObjCBridgedCastExpr
 
-      STMT_MS_DEPENDENT_EXISTS,   // MSDependentExistsStmt
-      EXPR_LAMBDA,                // LambdaExpr
+      STMT_MS_DEPENDENT_EXISTS, // MSDependentExistsStmt
+      EXPR_LAMBDA,              // LambdaExpr
       STMT_COROUTINE_BODY,
       STMT_CORETURN,
       EXPR_COAWAIT,
Index: clang/include/clang/Sema/Sema.h
===================================================================
--- clang/include/clang/Sema/Sema.h
+++ clang/include/clang/Sema/Sema.h
@@ -10480,6 +10480,11 @@
 
   /// Initialization of captured region for OpenMP region.
   void ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope);
+
+  /// Called for the syntactical loop (ForStmt or CXXForRangeStmt) associated
+  /// with an OpenMP loop directive.
+  StmtResult ActOnOpenMPCanonicalLoop(Stmt *AStmt);
+
   /// End of OpenMP region.
   ///
   /// \param S Statement associated with the current OpenMP region.
Index: clang/include/clang/Basic/StmtNodes.td
===================================================================
--- clang/include/clang/Basic/StmtNodes.td
+++ clang/include/clang/Basic/StmtNodes.td
@@ -216,6 +216,7 @@
 def AsTypeExpr : StmtNode<Expr>;
 
 // OpenMP Directives.
+def OMPCanonicalLoop : StmtNode<Stmt>;
 def OMPExecutableDirective : StmtNode<Stmt, 1>;
 def OMPLoopDirective : StmtNode<OMPExecutableDirective, 1>;
 def OMPParallelDirective : StmtNode<OMPExecutableDirective>;
Index: clang/include/clang/AST/StmtOpenMP.h
===================================================================
--- clang/include/clang/AST/StmtOpenMP.h
+++ clang/include/clang/AST/StmtOpenMP.h
@@ -28,6 +28,191 @@
 // AST classes for directives.
 //===----------------------------------------------------------------------===//
 
+/// Representation of an OpenMP canonical loop.
+///
+/// An OpenMP canonical loop is a for-statement or range-based for-statement
+/// with additional requirements that ensure that the number of iterations is
+/// known before entering the loop and allow skipping to an arbitrary iteration.
+/// The OMPCanonicalLoop AST node wraps a ForStmt or CXXForRangeStmt that is
+/// known to fulfill OpenMP's loop requirements.
+///
+/// There are three different kinds of iteration variables for different
+/// purposes:
+/// * Loop variable: The user-accessible variable with a different value for
+///   each iteration.
+/// * Loop counter: The variable used to identify a loop iteration; for a
+///   range-based for-statement, this is the hidden iterator '__begin'. For
+///   other loops, it is identical to the loop variable. Must be of
+///   random-access iterator or integer type.
+/// * Logical iteration counter: Normalized loop counter starting at 0 and
+///   incrementing by one at each iteration. It abstracts over the type of the
+///   loop counter and is always an unsigned integer type wide enough to
+///   represent the range of the loop counter variable.
+///
+/// This AST node provides two captured statements:
+/// * The distance function which computes the number of iterations.
+/// * The loop variable function that computes the loop variable when given a
+///   logical iteration number.
+///
+/// These captured statements provide the link between C/C++ semantics and the
+/// logical iteration counters used by the OpenMPIRBuilder, which is
+/// language-agnostic and therefore does not know e.g. how to advance a
+/// random-access iterator. The OpenMPIRBuilder will use this information to
+/// convert the loop into its simd, worksharing, distribute, taskloop, etc.
+/// variants. For compatibility with the non-OpenMPIRBuilder codegen path, an
+/// OMPCanonicalLoop can itself also be wrapped into the CapturedStmts of an
+/// OMPLoopDirective and skipped when searching for the associated syntactical
+/// loop.
+///
+/// Example:
+/// <code>
+///   std::vector<std::string> Container{"1", "2", "3"};
+///   for (std::string Str : Container)
+///      Body(Str);
+/// </code>
+/// which is syntactic sugar for approximately:
+/// <code>
+///   auto &&__range = Container;
+///   auto __begin = std::begin(__range);
+///   auto __end = std::end(__range);
+///   for (; __begin != __end; ++__begin) {
+///     std::string Str = *__begin;
+///     Body(Str);
+///   }
+/// </code>
+/// In this example, the loop variable is `Str`, the loop counter is `__begin`
+/// of type `std::vector<std::string>::iterator` and the logical iteration
+/// number type is `size_t` (unsigned version of
+/// `std::vector<std::string>::iterator::difference_type` aka `ptrdiff_t`).
+/// Therefore, the distance function will be
+/// <code>
+///   [&](size_t &Result) { Result = __end - __begin; }
+/// </code>
+/// and the loop variable function is
+/// <code>
+///   [&,__begin](std::vector<std::string>::iterator &Result, size_t Logical) {
+///   Result = __begin + Logical; }
+/// </code>
+class OMPCanonicalLoop : public Stmt {
+  friend class ASTStmtReader;
+  friend class ASTStmtWriter;
+
+  /// Children of this AST node.
+  enum {
+    LOOPY_STMT,
+    DISTANCE_FUNC,
+    LOOPVAR_FUNC,
+    LOOPVAR_REF,
+    LastSubStmt = LOOPVAR_REF
+  };
+
+private:
+  /// This AST node's children.
+  Stmt *SubStmts[LastSubStmt + 1] = {};
+
+  OMPCanonicalLoop() : Stmt(StmtClass::OMPCanonicalLoopClass) {}
+
+public:
+  /// Create a new OMPCanonicalLoop.
+  static OMPCanonicalLoop *create(const ASTContext &Ctx, Stmt *LoopStmt,
+                                  CapturedStmt *DistanceFunc,
+                                  CapturedStmt *LoopVarFunc,
+                                  DeclRefExpr *LoopVarRef) {
+    OMPCanonicalLoop *S = new (Ctx) OMPCanonicalLoop();
+    S->setLoopStmt(LoopStmt);
+    S->setDistanceFunc(DistanceFunc);
+    S->setLoopVarFunc(LoopVarFunc);
+    S->setLoopVarRef(LoopVarRef);
+    return S;
+  }
+
+  /// Create an empty OMPCanonicalLoop for deserialization.
+  static OMPCanonicalLoop *createEmpty(const ASTContext &Ctx) {
+    return new (Ctx) OMPCanonicalLoop();
+  }
+
+  static bool classof(const Stmt *S) {
+    return S->getStmtClass() == StmtClass::OMPCanonicalLoopClass;
+  }
+
+  SourceLocation getBeginLoc() const { return getLoopStmt()->getBeginLoc(); }
+  SourceLocation getEndLoc() const { return getLoopStmt()->getEndLoc(); }
+
+  /// Return this AST node's children.
+  /// @{
+  child_range children() {
+    return child_range(&SubStmts[0], &SubStmts[0] + LastSubStmt + 1);
+  }
+  const_child_range children() const {
+    return const_child_range(&SubStmts[0], &SubStmts[0] + LastSubStmt + 1);
+  }
+  /// @}
+
+  /// The wrapped syntactic loop statement (ForStmt or CXXForRangeStmt).
+  /// @{
+  Stmt *getLoopStmt() { return SubStmts[LOOPY_STMT]; }
+  const Stmt *getLoopStmt() const { return SubStmts[LOOPY_STMT]; }
+  void setLoopStmt(Stmt *S) {
+    assert((isa<ForStmt>(S) || isa<CXXForRangeStmt>(S)) &&
+           "Canonical loop must be a for loop (range-based or otherwise)");
+    SubStmts[LOOPY_STMT] = S;
+  }
+  /// @}
+
+  /// The function that computes the number of loop iterations. Can be evaluated
+  /// before entering the loop but after the syntactical loop's init
+  /// statement(s).
+  ///
+  /// Function signature: void(LogicalTy &Result)
+  /// Any values necessary to compute the distance are captured by the closure.
+  /// @{
+  CapturedStmt *getDistanceFunc() {
+    return cast<CapturedStmt>(SubStmts[DISTANCE_FUNC]);
+  }
+  const CapturedStmt *getDistanceFunc() const {
+    return cast<CapturedStmt>(SubStmts[DISTANCE_FUNC]);
+  }
+  void setDistanceFunc(CapturedStmt *S) {
+    assert(S && "Expected non-null captured statement");
+    SubStmts[DISTANCE_FUNC] = S;
+  }
+  /// @}
+
+  /// The function that computes the loop variable from a logical iteration
+  /// counter. Can be evaluated as the first statement in the loop.
+  ///
+  /// Function signature: void(LoopVarTy &Result, LogicalTy Number)
+  /// Any other values required to compute the loop variable (such as the start
+  /// value or step size) are captured by the closure. In particular, the
+  /// initial value of the loop counter is captured by value so that it is
+  /// unaffected by previous iterations.
+  /// @{
+  CapturedStmt *getLoopVarFunc() {
+    return cast<CapturedStmt>(SubStmts[LOOPVAR_FUNC]);
+  }
+  const CapturedStmt *getLoopVarFunc() const {
+    return cast<CapturedStmt>(SubStmts[LOOPVAR_FUNC]);
+  }
+  void setLoopVarFunc(CapturedStmt *S) {
+    assert(S && "Expected non-null captured statement");
+    SubStmts[LOOPVAR_FUNC] = S;
+  }
+  /// @}
+
+  /// Reference to the loop variable as accessed in the loop body.
+  /// @{
+  DeclRefExpr *getLoopVarRef() {
+    return cast<DeclRefExpr>(SubStmts[LOOPVAR_REF]);
+  }
+  const DeclRefExpr *getLoopVarRef() const {
+    return cast<DeclRefExpr>(SubStmts[LOOPVAR_REF]);
+  }
+  void setLoopVarRef(DeclRefExpr *E) {
+    assert(E && "Expected non-null loop variable");
+    SubStmts[LOOPVAR_REF] = E;
+  }
+  /// @}
+};
+
 /// This is a basic class for representing single OpenMP executable
 /// directive.
 ///
Index: clang/include/clang/AST/RecursiveASTVisitor.h
===================================================================
--- clang/include/clang/AST/RecursiveASTVisitor.h
+++ clang/include/clang/AST/RecursiveASTVisitor.h
@@ -2777,6 +2777,14 @@
   return true;
 }
 
+DEF_TRAVERSE_STMT(OMPCanonicalLoop, {
+  if (!getDerived().shouldVisitImplicitCode()) {
+    // Visit only the syntactical loop.
+    TRY_TO(TraverseStmt(S->getLoopStmt()));
+    ShouldVisitChildren = false;
+  }
+})
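
A small consumer-side sketch of the resulting traversal behavior (the visitor
class is invented for illustration; it assumes the usual clang AST headers):

  #include "clang/AST/RecursiveASTVisitor.h"

  class CanonicalLoopCounter
      : public clang::RecursiveASTVisitor<CanonicalLoopCounter> {
  public:
    unsigned NumLoops = 0;
    // With the default shouldVisitImplicitCode() == false, the traversal above
    // descends only into the wrapped ForStmt/CXXForRangeStmt and skips the
    // DistanceFunc/LoopVarFunc CapturedStmts.
    bool VisitOMPCanonicalLoop(clang::OMPCanonicalLoop *L) {
      ++NumLoops;
      return true;
    }
  };
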
+
 template <typename Derived>
 bool
 RecursiveASTVisitor<Derived>::TraverseOMPLoopDirective(OMPLoopDirective *S) {
Index: clang/include/clang-c/Index.h
===================================================================
--- clang/include/clang-c/Index.h
+++ clang/include/clang-c/Index.h
@@ -2568,7 +2568,11 @@
    */
   CXCursor_OMPScanDirective = 287,
 
-  CXCursor_LastStmt = CXCursor_OMPScanDirective,
+  /** OpenMP canonical loop.
+   */
+  CXCursor_OMPCanonicalLoop = 288,
+
+  CXCursor_LastStmt = CXCursor_OMPCanonicalLoop,
 
   /**
    * Cursor that represents the translation unit itself.