jz10 created this revision.
jz10 added a reviewer: jdoerfert.
Herald added subscribers: guansong, yaxunl.
Herald added a project: All.
jz10 requested review of this revision.
Herald added subscribers: openmp-commits, cfe-commits, sstefan1.
Herald added projects: clang, OpenMP.

This patch implements the asynchronous memory-copy routines with depend
objects that are specified in Version 5.1 of the OpenMP Application
Programming Interface. In brief, omp_target_memcpy_async and
omp_target_memcpy_rect_async perform asynchronous (nonblocking) memory copies
between any combination of host and device pointers. The basic idea is to
create implicit tasks that carry the memory-copy calls and honor the
dependencies specified by the depend objects. These implicit tasks are
executed by hidden helper threads in the OpenMP runtime.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D136103

Files:
  clang/docs/ReleaseNotes.rst
  openmp/libomptarget/src/api.cpp
  openmp/libomptarget/src/exports
  openmp/libomptarget/src/private.h

Index: openmp/libomptarget/src/private.h
===================================================================
--- openmp/libomptarget/src/private.h
+++ openmp/libomptarget/src/private.h
@@ -98,7 +98,47 @@
  * We maintain the same data structure for compatibility.
  */
 typedef int kmp_int32;
+typedef int64_t kmp_int64;
 typedef intptr_t kmp_intptr_t;
+
+typedef void * omp_depend_t; // depend-object handle; TargetMemcpyArgsTy below casts it to kmp_depend_info_t *
+struct kmp_task;
+typedef kmp_int32 (* kmp_routine_entry_t)( kmp_int32, struct kmp_task * ); // task entry point; the helpers in api.cpp name the arguments (gtid, task)
+typedef struct kmp_task {
+  void *              shareds; // api.cpp stores the pointer to the memcpy argument object here
+  kmp_routine_entry_t routine; // presumably the entry point the runtime invokes for this task (TODO confirm)
+  kmp_int32           part_id; // not read or written by the code in this patch
+} kmp_task_t;
+
+typedef struct kmp_tasking_flags { /* Task flag word; total struct must be exactly 32 bits (api.cpp overlays it on a kmp_int32) */
+  /* Compiler flags */ /* Total compiler flags must be 16 bits */
+  unsigned tiedness : 1; /* task is either tied (1) or untied (0) */
+  unsigned final : 1; /* task is final (1) so execute immediately */
+  unsigned merged_if0 : 1; /* no __kmpc_task_{begin/complete}_if0 calls in if0 code path */
+  unsigned destructors_thunk : 1; /* set if the compiler creates a thunk to invoke destructors from the runtime */
+  unsigned proxy : 1; /* task is a proxy task (it will be executed outside the context of the RTL) */
+  unsigned priority_specified : 1; /* set if the compiler provides priority setting for the task */
+  unsigned detachable : 1; /* 1 == can detach */
+  unsigned hidden_helper : 1; /* 1 == hidden helper task; the only bit the async memcpy routines set */
+  unsigned reserved : 8; /* reserved for compiler use; pads the compiler flags to 16 bits */
+
+  /* Library flags */ /* Total library flags must be 16 bits */
+  unsigned tasktype : 1; /* task is either explicit (1) or implicit (0) */
+  unsigned task_serial : 1; /* task is executed immediately (1) or deferred (0) */
+  unsigned tasking_ser : 1; /* all tasks in team are either executed immediately */
+  /* (1) or may be deferred (0) */
+  unsigned team_serial : 1; /* entire team is serial (1) [1 thread] or parallel */
+  /* (0) [>= 2 threads] */
+  /* If either team_serial or tasking_ser is set, task team may be NULL */
+  /* Task State Flags: */
+  unsigned started : 1; /* 1==started, 0==not started     */
+  unsigned executing : 1; /* 1==executing, 0==not executing */
+  unsigned complete : 1; /* 1==complete, 0==not complete   */
+  unsigned freed : 1; /* 1==freed, 0==allocated        */
+  unsigned native : 1; /* 1==gcc-compiled task, 0==intel */
+  unsigned reserved31 : 7; /* reserved for library use; pads the library flags to 16 bits */
+} kmp_tasking_flags_t;
+  
 // Compiler sends us this info:
 typedef struct kmp_depend_info {
   kmp_intptr_t base_addr;
@@ -117,6 +157,96 @@
                           kmp_depend_info_t *dep_list, kmp_int32 ndeps_noalias,
                           kmp_depend_info_t *noalias_dep_list)
     __attribute__((weak));
+// Presumably the generic task allocator of the tasking runtime; declared weak and not called by the api.cpp hunks of this patch.
+kmp_task_t* __kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags,
+                                  size_t sizeof_kmp_task_t, size_t sizeof_shareds,
+                                  kmp_routine_entry_t task_entry)
+  __attribute__((weak));
+// Allocator used by the async memcpy routines: they pass sizeof(kmp_task_t), a shareds size of 0, the helper entry and device_id -1.
+kmp_task_t* __kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags,
+                                         size_t sizeof_kmp_task_t, size_t sizeof_shareds,
+                                         kmp_routine_entry_t task_entry, kmp_int64 device_id)
+  __attribute__((weak));
+// Proxy-task hook (not called by the api.cpp hunks here), followed by the call the async routines use to submit new_task with its dep_list.
+void __kmpc_proxy_task_completed_ooo (kmp_task_t *ptask) __attribute__((weak));
+kmp_int32 __kmpc_omp_task_with_deps (ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task,
+                                     kmp_int32 ndeps, kmp_depend_info_t *dep_list,
+                                     kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list)
+  __attribute__((weak));
+
+// Argument bundle for one omp_target_memcpy_async request: the copy parameters
+// plus an owned array holding one dependence record per depend object.
+class TargetMemcpyArgsTy {
+public:
+  TargetMemcpyArgsTy(void *Dst_, const void *Src_, size_t Length_,
+                     size_t DstOffset_, size_t SrcOffset_, int DstDevice_, int SrcDevice_,
+                     int Depobj_count, omp_depend_t *Depobj_list) :
+    Dst(Dst_), Src(Src_), Length(Length_), DstOffset(DstOffset_), SrcOffset(SrcOffset_),
+    DstDevice(DstDevice_), SrcDevice(SrcDevice_), Depobjs(nullptr) {
+    if (Depobj_count <= 0)
+      return;
+    // Each depend object is read as a pointer to one kmp_depend_info_t record.
+    Depobjs = new kmp_depend_info_t[Depobj_count];
+    for (int i = 0; i < Depobj_count; i++)
+      Depobjs[i] = *static_cast<kmp_depend_info_t *>(Depobj_list[i]);
+  }
+
+  // Depobjs comes from new[], so it needs delete[]; deleting NULL is a no-op.
+  ~TargetMemcpyArgsTy() { delete[] Depobjs; }
+
+  void *Dst;
+  const void *Src;
+  size_t Length;
+  size_t DstOffset;
+  size_t SrcOffset;
+  int DstDevice;
+  int SrcDevice;
+
+  // The buffer for depend objects; stays NULL when Depobj_count <= 0. The object
+  // owns it, so instances must not be copied (a copy would free it twice).
+  kmp_depend_info_t *Depobjs;
+};
+
+// Argument bundle for one omp_target_memcpy_rect_async request: the copy
+// parameters plus an owned array holding one dependence record per depend object.
+class TargetMemcpyRectArgsTy {
+public:
+  TargetMemcpyRectArgsTy(void *Dst_, const void *Src_, size_t ElementSize_, int NumDims_,
+                         const size_t *Volume_, const size_t *DstOffsets_, const size_t *SrcOffsets_,
+                         const size_t *DstDimensions_, const size_t *SrcDimensions_,
+                         int DstDevice_, int SrcDevice_,
+                         int Depobj_count, omp_depend_t *Depobj_list) :
+    Dst(Dst_), Src(Src_), ElementSize(ElementSize_), NumDims(NumDims_), Volume(Volume_),
+    DstOffsets(DstOffsets_), SrcOffsets(SrcOffsets_), DstDimensions(DstDimensions_),
+    SrcDimensions(SrcDimensions_), DstDevice(DstDevice_), SrcDevice(SrcDevice_), Depobjs(nullptr) {
+    if (Depobj_count <= 0)
+      return;
+    Depobjs = new kmp_depend_info_t[Depobj_count];
+    for (int i = 0; i < Depobj_count; i++)
+      Depobjs[i] = *static_cast<kmp_depend_info_t *>(Depobj_list[i]);
+  }
+
+  // Depobjs comes from new[], so it needs delete[]; deleting NULL is a no-op.
+  ~TargetMemcpyRectArgsTy() { delete[] Depobjs; }
+
+  void *Dst;
+  const void *Src;
+  size_t ElementSize;
+  int NumDims;
+  // Stored as given, not copied: these arrays must stay valid until the copy runs.
+  const size_t *Volume;
+  const size_t *DstOffsets;
+  const size_t *SrcOffsets;
+  const size_t *DstDimensions;
+  const size_t *SrcDimensions;
+  int DstDevice;
+  int SrcDevice;
+
+  // The buffer for depend objects; stays NULL when Depobj_count <= 0. The object
+  // owns it, so instances must not be copied (a copy would free it twice).
+  kmp_depend_info_t *Depobjs;
+};
+    
 #ifdef __cplusplus
 }
 #endif
Index: openmp/libomptarget/src/exports
===================================================================
--- openmp/libomptarget/src/exports
+++ openmp/libomptarget/src/exports
@@ -38,6 +38,8 @@
     omp_target_is_present;
     omp_target_memcpy;
     omp_target_memcpy_rect;
+    omp_target_memcpy_async;
+    omp_target_memcpy_rect_async;
     omp_target_associate_ptr;
     omp_target_disassociate_ptr;
     llvm_omp_target_alloc_host;
Index: openmp/libomptarget/src/api.cpp
===================================================================
--- openmp/libomptarget/src/api.cpp
+++ openmp/libomptarget/src/api.cpp
@@ -200,6 +200,65 @@
   return Rc;
 }
 
+// Task entry for omp_target_memcpy_async: runs the blocking omp_target_memcpy on
+// the arguments in task->shareds, frees them, and returns the copy's result.
+int __kmpc_target_memcpy_async_helper(kmp_int32 gtid, kmp_task_t *task) {
+  if (task == NULL)
+    return -1;
+
+  TargetMemcpyArgsTy *args = (TargetMemcpyArgsTy *)task->shareds;
+  if (args == NULL)
+    return -1;
+
+  int Rc = omp_target_memcpy(args->Dst, args->Src, args->Length, args->DstOffset,
+                             args->SrcOffset, args->DstDevice, args->SrcDevice);
+  // The launcher creates args with new and never frees it, so release it here.
+  delete args;
+  return Rc;
+}
+
+// Asynchronous variant of omp_target_memcpy: wraps the copy in a hidden helper
+// task submitted together with the dependences taken from Depobj_list, and
+// returns without waiting for it. Returns OFFLOAD_SUCCESS once the task has been
+// submitted, or OFFLOAD_FAIL for invalid arguments or a failed task allocation.
+EXTERN int
+omp_target_memcpy_async(void *Dst, const void *Src, size_t Length,
+                        size_t DstOffset, size_t SrcOffset, int DstDevice, int SrcDevice,
+                        int Depobj_count, omp_depend_t *Depobj_list) {
+  TIMESCOPE();
+  DP("Call to omp_target_memcpy_async, dst device %d, src device %d, "
+     "dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
+     "src offset %zu, length %zu\n",
+     DstDevice, SrcDevice, DPxPTR(Dst), DPxPTR(Src), DstOffset, SrcOffset,
+     Length);
+
+  // Reject null addresses and a dependence count without a matching list.
+  if (Dst == NULL || Src == NULL || Depobj_count < 0 ||
+      (Depobj_count > 0 && Depobj_list == NULL))
+    return OFFLOAD_FAIL;
+
+  // Set the hidden-helper bit by viewing the 32-bit flag word as the flag struct.
+  kmp_int32 flags = 0;
+  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
+  input_flags->hidden_helper = 1;
+
+  // Allocate the task first so that a failed allocation leaves nothing to free.
+  int gtid = __kmpc_global_thread_num(NULL);
+  kmp_task_t *ptr = __kmpc_omp_target_task_alloc(
+      NULL, gtid, flags, sizeof(kmp_task_t), 0, &__kmpc_target_memcpy_async_helper, -1);
+  if (ptr == NULL)
+    return OFFLOAD_FAIL;
+
+  // The helper reads this object through task->shareds when the task runs.
+  TargetMemcpyArgsTy *args_ = new TargetMemcpyArgsTy(Dst, Src, Length, DstOffset, SrcOffset,
+                                                     DstDevice, SrcDevice, Depobj_count, Depobj_list);
+  ptr->shareds = args_;
+  __kmpc_omp_task_with_deps(NULL, gtid, ptr, Depobj_count, args_->Depobjs, 0, NULL);
+
+  DP("omp_target_memcpy_async returns %d\n", OFFLOAD_SUCCESS);
+  return OFFLOAD_SUCCESS;
+}
+
 EXTERN int
 omp_target_memcpy_rect(void *Dst, const void *Src, size_t ElementSize,
                        int NumDims, const size_t *Volume,
@@ -260,6 +319,73 @@
   return Rc;
 }
 
+// Task entry for omp_target_memcpy_rect_async: runs the blocking omp_target_memcpy_rect
+// on the arguments in task->shareds, frees them, and returns the copy's result.
+int __kmpc_target_memcpy_rect_async_helper(kmp_int32 gtid, kmp_task_t *task) {
+  if (task == NULL)
+    return -1;
+
+  TargetMemcpyRectArgsTy *args = (TargetMemcpyRectArgsTy *)task->shareds;
+  if (args == NULL)
+    return -1;
+
+  int Rc = omp_target_memcpy_rect(args->Dst, args->Src, args->ElementSize, args->NumDims, args->Volume,
+                                  args->DstOffsets, args->SrcOffsets, args->DstDimensions, args->SrcDimensions,
+                                  args->DstDevice, args->SrcDevice);
+  // The launcher creates args with new and never frees it, so release it here.
+  delete args;
+  return Rc;
+}
+
+// Asynchronous variant of omp_target_memcpy_rect: wraps the copy in a hidden helper
+// task submitted with the dependences from Depobj_list and returns without waiting.
+// The size arrays are passed on by pointer and are read when the task runs.
+EXTERN int
+omp_target_memcpy_rect_async(void *Dst, const void *Src, size_t ElementSize,
+                             int NumDims, const size_t *Volume,
+                             const size_t *DstOffsets, const size_t *SrcOffsets,
+                             const size_t *DstDimensions, const size_t *SrcDimensions,
+                             int DstDevice, int SrcDevice,
+                             int Depobj_count, omp_depend_t *Depobj_list) {
+  TIMESCOPE();
+  DP("Call to omp_target_memcpy_rect_async, dst device %d, src device %d, "
+     "dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", "
+     "src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", "
+     "volume " DPxMOD ", element size %zu, num_dims %d\n",
+     DstDevice, SrcDevice, DPxPTR(Dst), DPxPTR(Src), DPxPTR(DstOffsets),
+     DPxPTR(SrcOffsets), DPxPTR(DstDimensions), DPxPTR(SrcDimensions),
+     DPxPTR(Volume), ElementSize, NumDims);
+
+  // NOTE(review): the value 5 is kept from the original; confirm what a null address should return.
+  if (Dst == NULL || Src == NULL)
+    return 5;
+  if (Depobj_count < 0 || (Depobj_count > 0 && Depobj_list == NULL))
+    return OFFLOAD_FAIL;
+
+  // Set the hidden-helper bit by viewing the 32-bit flag word as the flag struct.
+  kmp_int32 flags = 0;
+  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
+  input_flags->hidden_helper = 1;
+
+  // Allocate the task first so that a failed allocation leaves nothing to free.
+  int gtid = __kmpc_global_thread_num(NULL);
+  kmp_task_t *ptr = __kmpc_omp_target_task_alloc(
+      NULL, gtid, flags, sizeof(kmp_task_t), 0, &__kmpc_target_memcpy_rect_async_helper, -1);
+  if (ptr == NULL)
+    return OFFLOAD_FAIL;
+
+  // The helper reads this object through task->shareds when the task runs.
+  TargetMemcpyRectArgsTy *args_ = new TargetMemcpyRectArgsTy(
+      Dst, Src, ElementSize, NumDims, Volume, DstOffsets, SrcOffsets,
+      DstDimensions, SrcDimensions, DstDevice, SrcDevice,
+      Depobj_count, Depobj_list);
+  ptr->shareds = args_;
+  __kmpc_omp_task_with_deps(NULL, gtid, ptr, Depobj_count, args_->Depobjs, 0, NULL);
+
+  DP("omp_target_memcpy_rect_async returns %d\n", OFFLOAD_SUCCESS);
+  return OFFLOAD_SUCCESS;
+}
+
 EXTERN int omp_target_associate_ptr(const void *HostPtr, const void *DevicePtr,
                                     size_t Size, size_t DeviceOffset,
                                     int DeviceNum) {
Index: clang/docs/ReleaseNotes.rst
===================================================================
--- clang/docs/ReleaseNotes.rst
+++ clang/docs/ReleaseNotes.rst
@@ -248,6 +248,9 @@
   not satisfied in the event of an instantiation failures in a requires expression's
   parameter list. We previously handled this correctly in a constraint evaluation
   context, but not in a requires clause evaluated as a boolean.
+- Address the thread identification problems in coroutines.
+  `Issue 47177 <https://github.com/llvm/llvm-project/issues/47177>`_
+  `Issue 47179 <https://github.com/llvm/llvm-project/issues/47179>`_
 
 Improvements to Clang's diagnostics
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to