[PATCH] D113359: [Libomptarget][WIP] Introduce VGPU Plugin

Atmn Patel via Phabricator via cfe-commits Sat, 06 Nov 2021 19:41:18 -0700

atmnpatel created this revision.
atmnpatel added reviewers: jdoerfert, tianshilei1992, JonChesterfield.
Herald added subscribers: ormris, dexonsmith, pengfei, hiraditya, mgorny.
atmnpatel requested review of this revision.
Herald added subscribers: llvm-commits, openmp-commits, cfe-commits, sstefan1.
Herald added projects: clang, OpenMP, LLVM.


This patch introduces a virtual GPU (x86) plugin, which allows the GPU
environment to be emulated on the host. It re-uses the same execution
model, compilation paths, and runtimes as a physical GPU. The total number
of threads, the number of threads per warp, and the number of warps per CTA
are set through the environment variables
`VGPU_{NUM_THREADS,THREADS_PER_WARP,WARPS_PER_CTA}` respectively.

Known Bugs (hence WIP):

- In the rebase from LLVM 12, larger applications started segfaulting. Small 
programs still work with this patch.
- The virtual GPU should be able to execute kernels asynchronously using the 
streams - but there is an unknown lifetime issue around the `ffi_call` that 
prevents the removal of the await after the `scheduleAsync` call.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D113359

Files:
  clang/lib/Basic/Targets/X86.h
  clang/lib/CodeGen/CGOpenMPRuntimeVirtualGPU.cpp
  clang/lib/CodeGen/CGOpenMPRuntimeVirtualGPU.h
  clang/lib/CodeGen/CMakeLists.txt
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/lib/Driver/ToolChains/Gnu.cpp
  clang/lib/Frontend/CompilerInvocation.cpp
  llvm/include/llvm/ADT/Triple.h
  llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
  llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
  llvm/lib/Support/Triple.cpp
  llvm/lib/Transforms/IPO/OpenMPOpt.cpp
  llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn
  openmp/CMakeLists.txt
  openmp/libomptarget/DeviceRTL/CMakeLists.txt
  openmp/libomptarget/DeviceRTL/include/Interface.h
  openmp/libomptarget/DeviceRTL/src/Kernel.cpp
  openmp/libomptarget/DeviceRTL/src/Mapping.cpp
  openmp/libomptarget/DeviceRTL/src/Misc.cpp
  openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
  openmp/libomptarget/DeviceRTL/src/Utils.cpp
  openmp/libomptarget/include/DeviceEnvironment.h
  openmp/libomptarget/plugins/CMakeLists.txt
  openmp/libomptarget/plugins/vgpu/CMakeLists.txt
  openmp/libomptarget/plugins/vgpu/src/DeviceEnvironment.cpp
  openmp/libomptarget/plugins/vgpu/src/DeviceEnvironmentImpl.h
  openmp/libomptarget/plugins/vgpu/src/rtl.cpp
  openmp/libomptarget/src/rtl.cpp

Index: openmp/libomptarget/src/rtl.cpp
===================================================================
--- openmp/libomptarget/src/rtl.cpp
+++ openmp/libomptarget/src/rtl.cpp
@@ -34,6 +34,7 @@
     /* SX-Aurora VE target  */ "libomptarget.rtl.ve.so",
     /* AMDGPU target        */ "libomptarget.rtl.amdgpu.so",
     /* Remote target        */ "libomptarget.rtl.rpc.so",
+    /* Virtual GPU target   */ "libomptarget.rtl.vgpu.so",
 };
 
 PluginManager *PM;
@@ -83,7 +84,7 @@
   // is correct and if they are supporting any devices.
   for (auto *Name : RTLNames) {
     DP("Loading library '%s'...\n", Name);
-    void *dynlib_handle = dlopen(Name, RTLD_NOW);
+    void *dynlib_handle = dlopen(Name, RTLD_NOW | RTLD_GLOBAL);
 
     if (!dynlib_handle) {
       // Library does not exist or cannot be found.
Index: openmp/libomptarget/plugins/vgpu/src/rtl.cpp
===================================================================
--- /dev/null
+++ openmp/libomptarget/plugins/vgpu/src/rtl.cpp
@@ -0,0 +1,623 @@
+//===------RTLs/vgpu/src/rtl.cpp - Target RTLs Implementation ----- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// RTL for virtual (x86) GPU
+//
+//===----------------------------------------------------------------------===//
+
+#include <barrier>
+#include <cassert>
+#include <cmath>
+#include <condition_variable>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <dlfcn.h>
+#include <ffi.h>
+#include <functional>
+#include <gelf.h>
+#include <link.h>
+#include <list>
+#include <memory>
+#include <mutex>
+#include <queue>
+#include <thread>
+#include <vector>
+
+#include "Debug.h"
+#include "DeviceEnvironment.h"
+#include "DeviceEnvironmentImpl.h"
+#include "omptarget.h"
+#include "omptargetplugin.h"
+
+#ifndef TARGET_NAME
+#define TARGET_NAME Generic ELF - 64bit
+#endif
+#define DEBUG_PREFIX "TARGET " GETNAME(TARGET_NAME) " RTL"
+
+#ifndef TARGET_ELF_ID
+#define TARGET_ELF_ID 0
+#endif
+
+#include "elf_common.h"
+
+#define NUMBER_OF_DEVICES 1
+#define OFFLOADSECTIONNAME "omp_offloading_entries"
+
+#define DEBUG false
+
+/// Record describing one dynamic library loaded for this target.
+struct DynLibTy {
+  char *FileName; // Path passed to remove() by ~RTLDeviceInfoTy; not owned here, so the buffer must outlive this record.
+  void *Handle; // dlopen() handle; passed to dlclose() by ~RTLDeviceInfoTy.
+};
+
+/// Keep entries table per device.
+struct FuncOrGblEntryTy {
+  __tgt_target_table Table; // EntriesBegin/EntriesEnd are filled in by RTLDeviceInfoTy::createOffloadTable.
+};
+
+thread_local ThreadEnvironmentTy *ThreadEnvironment; // Per-thread pointer; each VGPU worker thread assigns it when it starts and deletes it when it exits.
+
+/// Class containing all the device information: the offload entry tables
+/// registered per device and the dynamic libraries that back them.
+class RTLDeviceInfoTy {
+  /// Entry tables indexed by device id; the newest table of a device sits at
+  /// the back of its list.
+  std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
+
+  /// Return the list of tables of \p device_id after checking that the id is
+  /// within [0, number of devices).
+  std::list<FuncOrGblEntryTy> &tablesFor(int32_t device_id) {
+    assert(device_id >= 0 && device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    return FuncGblEntries[device_id];
+  }
+
+public:
+  /// Libraries registered by the plugin; closed and unlinked on destruction.
+  std::list<DynLibTy> DynLibs;
+
+  /// Record the entry range [\p begin, \p end) as the newest table of
+  /// \p device_id.
+  void createOffloadTable(int32_t device_id, __tgt_offload_entry *begin,
+                          __tgt_offload_entry *end) {
+    std::list<FuncOrGblEntryTy> &Tables = tablesFor(device_id);
+    Tables.emplace_back();
+    FuncOrGblEntryTy &E = Tables.back();
+
+    E.Table.EntriesBegin = begin;
+    E.Table.EntriesEnd = end;
+  }
+
+  /// Return true if \p addr is the address of an entry in the newest table of
+  /// \p device_id. Returns false when no table has been registered yet.
+  bool findOffloadEntry(int32_t device_id, void *addr) {
+    std::list<FuncOrGblEntryTy> &Tables = tablesFor(device_id);
+    // back() on an empty list is undefined behaviour, so treat "nothing
+    // loaded yet" as "not found".
+    if (Tables.empty())
+      return false;
+
+    const __tgt_target_table &Table = Tables.back().Table;
+    for (__tgt_offload_entry *i = Table.EntriesBegin, *e = Table.EntriesEnd;
+         i < e; ++i) {
+      if (i->addr == addr)
+        return true;
+    }
+
+    return false;
+  }
+
+  /// Return a pointer to the newest entries table of \p device_id, or nullptr
+  /// if no table has been registered for that device.
+  __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
+    std::list<FuncOrGblEntryTy> &Tables = tablesFor(device_id);
+    if (Tables.empty())
+      return nullptr;
+
+    return &Tables.back().Table;
+  }
+
+  /// Create empty table lists for \p num_devices devices.
+  RTLDeviceInfoTy(int32_t num_devices) { FuncGblEntries.resize(num_devices); }
+
+  /// Close every successfully opened library and delete the file it was
+  /// loaded from.
+  ~RTLDeviceInfoTy() {
+    for (auto &lib : DynLibs) {
+      if (lib.Handle) {
+        dlclose(lib.Handle);
+        remove(lib.FileName);
+      }
+    }
+  }
+};
+
+static RTLDeviceInfoTy DeviceInfo(NUMBER_OF_DEVICES); // Entry tables and loaded libraries for every device of this plugin.
+
+std::vector<CTAEnvironmentTy *> CTAEnvironments; // Filled by VGPUTy's constructor; entries are deleted by its destructor.
+std::vector<WarpEnvironmentTy *> WarpEnvironments; // Filled by VGPUTy's constructor; entries are deleted by its destructor.
+
+/// Host emulation of a GPU. A fixed pool of worker threads, grouped into warps
+/// and CTAs, runs every queued kernel: in each round CTA c executes team
+/// (round * NumCTAs + c) until all teams of the kernel have been run.
+struct VGPUTy {
+  /// One kernel launch queued on a stream.
+  struct KernelTy {
+    ffi_cif *Cif; // Deleted by worker 0 after the last team has finished.
+    std::function<void(void)> Kernel; // Called once per worker and team.
+    int NumTeams;
+
+    KernelTy(ffi_cif *Cif, std::function<void(void)> Kernel, int NumTeams)
+        : Cif(Cif), Kernel(Kernel), NumTeams(NumTeams) {}
+  };
+
+  /// Mutex-protected FIFO of the kernels launched on one stream.
+  struct VGPUStreamTy {
+    std::queue<KernelTy> Kernels;
+    std::mutex Mtx;
+
+    void emplace(ffi_cif *Cif, std::function<void(void)> F, int NumTeams) {
+      std::lock_guard Guard(Mtx);
+      Kernels.emplace(Cif, F, NumTeams);
+    }
+
+    KernelTy front() {
+      std::lock_guard Guard(Mtx);
+      return Kernels.front();
+    }
+
+    void pop() {
+      std::lock_guard Guard(Mtx);
+      Kernels.pop();
+    }
+
+    bool empty() {
+      std::lock_guard Guard(Mtx);
+      return Kernels.empty();
+    }
+  };
+
+  /// Mutex-protected FIFO holding one entry per scheduled kernel; each entry
+  /// names the __tgt_async_info whose stream the kernel was pushed to.
+  struct AsyncInfoQueueTy {
+    std::deque<__tgt_async_info *> Streams;
+    std::mutex Mtx;
+
+    bool empty() {
+      std::lock_guard Guard(Mtx);
+      return Streams.empty();
+    }
+
+    __tgt_async_info *front() {
+      std::lock_guard Guard(Mtx);
+      return Streams.front();
+    }
+
+    void pop() {
+      std::lock_guard Guard(Mtx);
+      Streams.pop_front();
+    }
+
+    void emplace(__tgt_async_info *AsyncInfo) {
+      std::lock_guard Guard(Mtx);
+      Streams.emplace_back(AsyncInfo);
+    }
+
+    /// Return true while \p AsyncInfo still has at least one entry queued.
+    bool contains(__tgt_async_info *AsyncInfo) {
+      std::lock_guard Guard(Mtx);
+      for (__tgt_async_info *Entry : Streams)
+        if (Entry == AsyncInfo)
+          return true;
+      return false;
+    }
+  } ExecutionQueue;
+
+  /// Return the stream stored in \p AsyncInfo->Queue, creating it on first
+  /// use.
+  VGPUStreamTy *getStream(__tgt_async_info *AsyncInfo) {
+    assert(AsyncInfo != nullptr && "async_info ptr was null");
+
+    if (!AsyncInfo->Queue)
+      AsyncInfo->Queue = new VGPUStreamTy();
+
+    return reinterpret_cast<VGPUStreamTy *>(AsyncInfo->Queue);
+  }
+
+  std::atomic<bool> Running;
+  std::vector<std::thread> Threads;
+  int WarpsPerCTA;
+  int NumCTAs;
+
+  /// Rendezvous point for all worker threads.
+  std::unique_ptr<std::barrier<std::function<void(void)>>> Barrier;
+  /// Signalled (with ExecutionQueue.Mtx) when work arrives or on shutdown.
+  std::condition_variable WorkAvailable;
+  /// Guards removal of ExecutionQueue entries for the benefit of waiters.
+  std::mutex WorkDoneMtx;
+  /// Signalled (with WorkDoneMtx) whenever an ExecutionQueue entry is removed.
+  std::condition_variable WorkDone;
+
+  /// Spawn the worker threads.
+  ///
+  /// Each argument is overridden by its environment variable when set
+  /// (VGPU_NUM_THREADS, VGPU_THREADS_PER_WARP, VGPU_WARPS_PER_CTA). A value of
+  /// -1 selects the default: the hardware concurrency for \p NumThreads, one
+  /// warp spanning all threads for \p ThreadsPerWarp, and one warp per CTA for
+  /// \p WarpsPerCTA. \p NumThreads must be a positive multiple of
+  /// ThreadsPerWarp * WarpsPerCTA.
+  VGPUTy(int NumThreads = -1, int ThreadsPerWarp = -1, int WarpsPerCTA = -1)
+      : Running(true) {
+    if (const char *Env = std::getenv("VGPU_NUM_THREADS"))
+      NumThreads = std::stoi(Env);
+    if (const char *Env = std::getenv("VGPU_THREADS_PER_WARP"))
+      ThreadsPerWarp = std::stoi(Env);
+    if (const char *Env = std::getenv("VGPU_WARPS_PER_CTA"))
+      WarpsPerCTA = std::stoi(Env);
+
+    if (NumThreads == -1) {
+      NumThreads = std::thread::hardware_concurrency();
+      // hardware_concurrency() may report 0 when the value is unknown.
+      if (NumThreads <= 0)
+        NumThreads = 1;
+    }
+    if (ThreadsPerWarp == -1)
+      ThreadsPerWarp = NumThreads;
+    if (WarpsPerCTA == -1)
+      WarpsPerCTA = 1;
+
+    // The parameter shadows the member of the same name, so the member has to
+    // be assigned explicitly.
+    this->WarpsPerCTA = WarpsPerCTA;
+
+    // Validate before dividing. The thread count must be a whole number of
+    // CTAs, otherwise fewer threads than NumThreads would be spawned.
+    const int ThreadsPerCTA = ThreadsPerWarp * WarpsPerCTA;
+    assert(NumThreads > 0 && ThreadsPerCTA > 0 &&
+           NumThreads % ThreadsPerCTA == 0 && "Invalid VGPU Config");
+
+    NumCTAs =
+        (NumThreads > 0 && ThreadsPerCTA > 0) ? NumThreads / ThreadsPerCTA : 0;
+
+    // Size the barrier by the number of threads that are really created so
+    // that it can always complete.
+    const int NumWorkers = NumCTAs * ThreadsPerCTA;
+    Barrier = std::make_unique<std::barrier<std::function<void(void)>>>(
+        NumWorkers, []() {});
+
+    Threads.reserve(NumWorkers);
+
+    int GlobalThreadIdx = 0;
+    for (int CTAIdx = 0; CTAIdx < NumCTAs; CTAIdx++) {
+      auto *CTAEnv = new CTAEnvironmentTy(CTAIdx, ThreadsPerCTA, NumCTAs);
+      for (int WarpIdx = 0; WarpIdx < WarpsPerCTA; WarpIdx++) {
+        auto *WarpEnv = new WarpEnvironmentTy(WarpIdx, ThreadsPerWarp);
+        for (int ThreadIdx = 0; ThreadIdx < ThreadsPerWarp; ThreadIdx++) {
+          Threads.emplace_back(
+              [this, ThreadIdx, GlobalThreadIdx, CTAEnv, WarpEnv]() {
+                runWorker(ThreadIdx, GlobalThreadIdx, CTAEnv, WarpEnv);
+              });
+          ++GlobalThreadIdx;
+        }
+        WarpEnvironments.push_back(WarpEnv);
+      }
+      CTAEnvironments.push_back(CTAEnv);
+    }
+  }
+
+  /// Body of every worker thread. \p ThreadIdx is the lane within the warp,
+  /// \p GlobalThreadIdx the index among all workers; worker 0 additionally
+  /// performs the queue bookkeeping between barriers.
+  ///
+  /// NOTE(review): a kernel pushed onto a stream while the workers are between
+  /// their individual Stream->empty() checks can still desynchronise the
+  /// barrier phases; that is not addressed here.
+  void runWorker(int ThreadIdx, int GlobalThreadIdx, CTAEnvironmentTy *CTAEnv,
+                 WarpEnvironmentTy *WarpEnv) {
+    ThreadEnvironment = new ThreadEnvironmentTy(ThreadIdx, WarpEnv, CTAEnv);
+
+    while (Running) {
+      {
+        // Sleep until there is work or the device is shutting down.
+        std::unique_lock<std::mutex> UniqueLock(ExecutionQueue.Mtx);
+        WorkAvailable.wait(UniqueLock, [&]() {
+          return !Running || !ExecutionQueue.Streams.empty();
+        });
+      }
+
+      while (!ExecutionQueue.empty()) {
+        auto *Stream = getStream(ExecutionQueue.front());
+        while (!Stream->empty()) {
+          auto KernelInfo = Stream->front();
+
+          const int NumTeams = KernelInfo.NumTeams;
+          for (int TeamBase = 0; TeamBase < NumTeams; TeamBase += NumCTAs) {
+            // Only run team ids that exist; the last round may be partial.
+            const int TeamId = TeamBase + static_cast<int>(CTAEnv->getId());
+            if (TeamId < NumTeams) {
+              ThreadEnvironment->setBlockEnv(
+                  new ThreadBlockEnvironmentTy(TeamId, NumTeams));
+              KernelInfo.Kernel();
+              ThreadEnvironment->resetBlockEnv();
+            }
+            Barrier->arrive_and_wait();
+          }
+
+          if (GlobalThreadIdx == 0) {
+            delete KernelInfo.Cif;
+            Stream->pop();
+          }
+
+          Barrier->arrive_and_wait();
+        }
+
+        // Make sure every worker has stopped looking at Stream before the
+        // queue entry disappears: once it is gone, a thread blocked in await()
+        // may free the stream and the __tgt_async_info.
+        Barrier->arrive_and_wait();
+        if (GlobalThreadIdx == 0) {
+          {
+            std::lock_guard<std::mutex> Guard(WorkDoneMtx);
+            ExecutionQueue.pop();
+          }
+          WorkDone.notify_all();
+        }
+        Barrier->arrive_and_wait();
+      }
+    }
+
+    delete ThreadEnvironment;
+  }
+
+  /// Drain all queued work, stop and join the workers, and free the CTA and
+  /// warp environments.
+  ~VGPUTy() {
+    awaitAll();
+
+    {
+      // Flip the flag under the mutex the workers wait with, so that no worker
+      // can miss the wakeup between testing the flag and going to sleep.
+      std::lock_guard<std::mutex> Guard(ExecutionQueue.Mtx);
+      Running = false;
+    }
+    WorkAvailable.notify_all();
+
+    for (auto &Thread : Threads) {
+      if (Thread.joinable()) {
+        Thread.join();
+      }
+    }
+
+    for (auto *CTAEnv : CTAEnvironments)
+      delete CTAEnv;
+
+    for (auto *WarpEnv : WarpEnvironments)
+      delete WarpEnv;
+  }
+
+  /// Block until the workers have removed every ExecutionQueue entry that
+  /// refers to \p AsyncInfo. After this returns the workers no longer touch
+  /// \p AsyncInfo or its stream.
+  void await(__tgt_async_info *AsyncInfo) {
+    assert(AsyncInfo != nullptr && "async_info ptr was null");
+    std::unique_lock<std::mutex> UniqueLock(WorkDoneMtx);
+    WorkDone.wait(UniqueLock,
+                  [&]() { return !ExecutionQueue.contains(AsyncInfo); });
+  }
+
+  /// Block until the execution queue is empty.
+  void awaitAll() {
+    std::unique_lock<std::mutex> UniqueLock(WorkDoneMtx);
+    WorkDone.wait(UniqueLock, [&]() { return ExecutionQueue.empty(); });
+  }
+
+  /// Queue kernel \p F on the stream of \p AsyncInfo and wake the workers.
+  /// \p Cif is handed over to the workers, which delete it after the kernel
+  /// ran. A non-positive \p NumTeams selects one team per CTA.
+  void scheduleAsync(__tgt_async_info *AsyncInfo, ffi_cif *Cif,
+                     std::function<void(void)> F, int NumTeams) {
+    if (NumTeams <= 0)
+      NumTeams = NumCTAs;
+    auto *Stream = getStream(AsyncInfo);
+    Stream->emplace(Cif, F, NumTeams);
+    ExecutionQueue.emplace(AsyncInfo);
+    WorkAvailable.notify_all();
+  }
+};
+
+VGPUTy VGPU; // The single virtual GPU instance; its constructor spawns the worker threads.
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
+// Without a positive TARGET_ELF_ID every image is rejected (0 is returned).
+#if TARGET_ELF_ID < 1
+  return 0;
+#else
+  return elf_check_machine(image, TARGET_ELF_ID); // NOTE(review): presumably declared in elf_common.h and matching the image's ELF machine id - confirm.
+#endif
+}
+
+int32_t __tgt_rtl_number_of_devices() { return NUMBER_OF_DEVICES; } // Always reports the compile-time device count.
+
+int32_t __tgt_rtl_init_device(int32_t device_id) { return OFFLOAD_SUCCESS; } // Nothing to set up per device here; device_id is ignored.
+
+/// Return the sh_addr of the section named OFFLOADSECTIONNAME in the ELF image
+/// that starts at image->ImageStart and is \p ImageSize bytes long. Returns 0
+/// if the image cannot be parsed or contains no such section.
+static Elf64_Off findEntriesSectionOffset(__tgt_device_image *image,
+                                          size_t ImageSize) {
+  // Is the library version incompatible with the header file?
+  if (elf_version(EV_CURRENT) == EV_NONE) {
+    DP("Incompatible ELF library!\n");
+    return 0;
+  }
+
+  // Obtain elf handler
+  Elf *e = elf_memory((char *)image->ImageStart, ImageSize);
+  if (!e) {
+    DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1));
+    return 0;
+  }
+
+  Elf64_Off entries_offset = 0;
+  size_t shstrndx;
+
+  if (elf_kind(e) != ELF_K_ELF) {
+    DP("Invalid Elf kind!\n");
+  } else if (elf_getshdrstrndx(e, &shstrndx)) {
+    DP("Unable to get ELF strings index!\n");
+  } else {
+    Elf_Scn *section = nullptr;
+    while ((section = elf_nextscn(e, section))) {
+      GElf_Shdr hdr;
+      // Both calls return NULL on failure; skip such sections instead of
+      // reading an unset header or handing NULL to strcmp.
+      if (!gelf_getshdr(section, &hdr))
+        continue;
+      const char *name = elf_strptr(e, shstrndx, hdr.sh_name);
+      if (name && !strcmp(name, OFFLOADSECTIONNAME)) {
+        entries_offset = hdr.sh_addr;
+        break;
+      }
+    }
+
+    if (!entries_offset) {
+      DP("Entries Section Offset Not Found\n");
+    }
+  }
+
+  // The handle is only needed for the section lookup above.
+  elf_end(e);
+  return entries_offset;
+}
+
+/// Write the device image to a new temporary file created from the mkstemp
+/// template \p tmp_name, which is updated in place to the chosen path.
+/// Returns false on failure, in which case the file has been removed again.
+static bool dumpImageToTempFile(__tgt_device_image *image, size_t ImageSize,
+                                char *tmp_name) {
+  int tmp_fd = mkstemp(tmp_name);
+  if (tmp_fd == -1)
+    return false;
+
+  FILE *ftmp = fdopen(tmp_fd, "wb");
+  if (!ftmp) {
+    // NOTE(review): tmp_fd itself stays open on this path; closing it needs
+    // close() from <unistd.h>, which this file does not include.
+    remove(tmp_name);
+    return false;
+  }
+
+  bool Written = fwrite(image->ImageStart, ImageSize, 1, ftmp) == 1;
+  // fclose flushes the buffered data, so its result matters as well.
+  if (fclose(ftmp) != 0)
+    Written = false;
+
+  if (!Written)
+    remove(tmp_name);
+  return Written;
+}
+
+/// Load the device image \p image for \p device_id and return its offload
+/// entries table, or NULL on failure.
+///
+/// The image is written to a temporary file and loaded with dlopen. The
+/// entries table is then located at the library's load address plus the
+/// address of the OFFLOADSECTIONNAME section found in the ELF image. The
+/// number of entries is taken from the host-side range
+/// [image->EntriesBegin, image->EntriesEnd).
+__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
+                                          __tgt_device_image *image) {
+
+  DP("Dev %d: load binary from " DPxMOD " image\n", device_id,
+     DPxPTR(image->ImageStart));
+
+  assert(device_id >= 0 && device_id < NUMBER_OF_DEVICES && "bad dev id");
+
+  size_t ImageSize = (size_t)image->ImageEnd - (size_t)image->ImageStart;
+  size_t NumEntries = (size_t)(image->EntriesEnd - image->EntriesBegin);
+  DP("Expecting to have %zd entries defined.\n", NumEntries);
+
+  // Find the entries section offset
+  Elf64_Off entries_offset = findEntriesSectionOffset(image, ImageSize);
+  if (!entries_offset)
+    return nullptr;
+
+  DP("Offset of entries section is (" DPxMOD ").\n", DPxPTR(entries_offset));
+
+  // load dynamic library and get the entry points. We use the dl library
+  // to do the loading of the library, but we could do it directly to avoid
+  // the dump to the temporary file.
+  //
+  // 1) Create tmp file with the library contents.
+  // 2) Use dlopen to load the file and dlsym to retrieve the symbols.
+  char tmp_name[] = "/tmp/tmpfile_XXXXXX";
+  if (!dumpImageToTempFile(image, ImageSize, tmp_name))
+    return nullptr;
+
+  void *Handle = dlopen(tmp_name, RTLD_NOW | RTLD_GLOBAL);
+  if (!Handle) {
+    DP("Target library loading error: %s\n", dlerror());
+    remove(tmp_name);
+    return nullptr;
+  }
+
+  // DynLibTy keeps a plain pointer that ~RTLDeviceInfoTy later passes to
+  // remove(), so it must not point at the local tmp_name array. The heap copy
+  // is intentionally never freed; it has to stay valid until that destructor.
+  char *FileName = strdup(tmp_name);
+  if (!FileName) {
+    dlclose(Handle);
+    remove(tmp_name);
+    return nullptr;
+  }
+
+  DynLibTy Lib = {FileName, Handle};
+  DeviceInfo.DynLibs.push_back(Lib);
+
+  struct link_map *libInfo = (struct link_map *)Lib.Handle;
+
+  // The place where the entries info is loaded is the library base address
+  // plus the offset determined from the ELF file.
+  Elf64_Addr entries_addr = libInfo->l_addr + entries_offset;
+
+  DP("Pointer to first entry to be loaded is (" DPxMOD ").\n",
+     DPxPTR(entries_addr));
+
+  // Table of pointers to all the entries in the target.
+  __tgt_offload_entry *entries_table = (__tgt_offload_entry *)entries_addr;
+
+  __tgt_offload_entry *entries_begin = &entries_table[0];
+  __tgt_offload_entry *entries_end = entries_begin + NumEntries;
+
+  if (!entries_begin) {
+    DP("Can't obtain entries begin\n");
+    return nullptr;
+  }
+
+  DP("Entries table range is (" DPxMOD ")->(" DPxMOD ")\n",
+     DPxPTR(entries_begin), DPxPTR(entries_end));
+  DeviceInfo.createOffloadTable(device_id, entries_begin, entries_end);
+
+  return DeviceInfo.getOffloadEntriesTable(device_id);
+}
+
+// Explicit memory allocator of the plugin. Every recognised allocation kind is
+// served by plain malloc; an unrecognised kind is reported and yields NULL.
+// device_id and hst_ptr are not used.
+void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr,
+                           int32_t kind) {
+  const bool IsKnownKind =
+      kind == TARGET_ALLOC_DEVICE || kind == TARGET_ALLOC_HOST ||
+      kind == TARGET_ALLOC_SHARED || kind == TARGET_ALLOC_DEFAULT;
+
+  if (!IsKnownKind) {
+    REPORT("Invalid target data allocation kind");
+    return NULL;
+  }
+
+  return malloc(size);
+}
+
+// Copy `size` bytes from host memory (hst_ptr) to device memory (tgt_ptr).
+// All queued kernels are drained first; device_id is not used.
+int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
+                              int64_t size) {
+  VGPU.awaitAll();
+
+  void *const Destination = tgt_ptr;
+  const void *const Source = hst_ptr;
+  memcpy(Destination, Source, size);
+
+  return OFFLOAD_SUCCESS;
+}
+
+// Copy `size` bytes from device memory (tgt_ptr) back to host memory
+// (hst_ptr). All queued kernels are drained first; device_id is not used.
+int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
+                                int64_t size) {
+  VGPU.awaitAll();
+
+  void *const Destination = hst_ptr;
+  const void *const Source = tgt_ptr;
+  memcpy(Destination, Source, size);
+
+  return OFFLOAD_SUCCESS;
+}
+
+// Release device memory with free(), the counterpart of the malloc() used by
+// __tgt_rtl_data_alloc. device_id is not used.
+int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
+  void *const Allocation = tgt_ptr;
+  free(Allocation);
+  return OFFLOAD_SUCCESS;
+}
+
+// Wait for the work associated with async_info, then destroy its stream and
+// clear the Queue pointer. Always returns 0; device_id is not used.
+int32_t __tgt_rtl_synchronize(int32_t device_id, __tgt_async_info *async_info) {
+  VGPU.await(async_info);
+
+  auto *FinishedStream =
+      reinterpret_cast<VGPUTy::VGPUStreamTy *>(async_info->Queue);
+  delete FinishedStream;
+  async_info->Queue = nullptr;
+
+  return 0;
+}
+
+// Blocking team-region launch: forwards to the asynchronous variant with a
+// local __tgt_async_info and, if the launch succeeded, returns the result of
+// synchronizing on it. Any launch failure is reported as OFFLOAD_FAIL.
+int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
+                                         void **tgt_args,
+                                         ptrdiff_t *tgt_offsets,
+                                         int32_t arg_num, int32_t team_num,
+                                         int32_t thread_limit,
+                                         uint64_t loop_tripcount) {
+  __tgt_async_info AsyncInfo;
+
+  const int LaunchStatus = __tgt_rtl_run_target_team_region_async(
+      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
+      thread_limit, loop_tripcount, &AsyncInfo);
+
+  if (LaunchStatus == OFFLOAD_SUCCESS)
+    return __tgt_rtl_synchronize(device_id, &AsyncInfo);
+
+  return OFFLOAD_FAIL;
+}
+
+// Launch the kernel at tgt_entry_ptr on the virtual GPU through libffi.
+//
+// Every kernel argument is passed as a pointer: argument i is
+// tgt_args[i] + tgt_offsets[i]. team_num is forwarded to the scheduler as the
+// number of teams; device_id, thread_limit and loop_tripcount are not used.
+// Returns OFFLOAD_FAIL if the libffi call interface cannot be prepared and
+// OFFLOAD_SUCCESS otherwise. The call still waits for the kernel to finish
+// before returning.
+int32_t __tgt_rtl_run_target_team_region_async(
+    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
+    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
+    int32_t thread_limit, uint64_t loop_tripcount /*not used*/,
+    __tgt_async_info *async_info) {
+  ffi_cif *cif = new ffi_cif();
+
+  // All args are references. The three vectors are shared with the kernel
+  // closure below: cif keeps pointing into args_types, and args holds the
+  // addresses of the elements of ptrs.
+  auto args_types =
+      std::make_shared<std::vector<ffi_type *>>(arg_num, &ffi_type_pointer);
+  auto args = std::make_shared<std::vector<void *>>(arg_num);
+  auto ptrs = std::make_shared<std::vector<void *>>(arg_num);
+
+  for (int32_t i = 0; i < arg_num; ++i) {
+    (*ptrs)[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
+    (*args)[i] = &(*ptrs)[i];
+  }
+
+  // data() is valid for an empty vector, unlike &v[0].
+  ffi_status status = ffi_prep_cif(cif, FFI_DEFAULT_ABI, arg_num,
+                                   &ffi_type_void, args_types->data());
+
+  assert(status == FFI_OK && "Unable to prepare target launch!");
+
+  if (status != FFI_OK) {
+    // The scheduler never saw cif, so it has to be released here.
+    delete cif;
+    return OFFLOAD_FAIL;
+  }
+
+  DP("Running entry point at " DPxMOD "...\n", DPxPTR(tgt_entry_ptr));
+
+  void (*entry)(void) = reinterpret_cast<void (*)(void)>(tgt_entry_ptr);
+
+  // Capture everything by value. The closure is stored in the stream and run
+  // by the worker threads, so it must not refer to locals of this frame; the
+  // shared_ptr copies keep the argument storage alive as long as the closure.
+  VGPU.scheduleAsync(
+      async_info, cif,
+      [cif, entry, args, ptrs, args_types]() {
+        // ptrs and args_types are held only to extend their lifetime.
+        (void)ptrs;
+        (void)args_types;
+        ffi_call(cif, entry, nullptr, args->data());
+      },
+      team_num);
+  VGPU.await(async_info);
+  return OFFLOAD_SUCCESS;
+}
+
+// Blocking launch of a plain target region: a team-region launch with one
+// team, a thread limit of one and a trip count of zero.
+int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
+                                    void **tgt_args, ptrdiff_t *tgt_offsets,
+                                    int32_t arg_num) {
+  const int32_t SingleTeam = 1;
+  const int32_t SingleThread = 1;
+  const uint64_t NoTripCount = 0;
+
+  return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
+                                          tgt_offsets, arg_num, SingleTeam,
+                                          SingleThread, NoTripCount);
+}
+
+// Launch of a plain target region on the given async_info: a team-region
+// launch with one team, a thread limit of one and a trip count of zero.
+int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
+                                          void *tgt_entry_ptr, void **tgt_args,
+                                          ptrdiff_t *tgt_offsets,
+                                          int32_t arg_num,
+                                          __tgt_async_info *async_info) {
+  const int32_t SingleTeam = 1;
+  const int32_t SingleThread = 1;
+  const uint64_t NoTripCount = 0;
+
+  return __tgt_rtl_run_target_team_region_async(
+      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, SingleTeam,
+      SingleThread, NoTripCount, async_info);
+}
+
+#ifdef __cplusplus
+}
+#endif
Index: openmp/libomptarget/plugins/vgpu/src/DeviceEnvironmentImpl.h
===================================================================
--- /dev/null
+++ openmp/libomptarget/plugins/vgpu/src/DeviceEnvironmentImpl.h
@@ -0,0 +1,168 @@
+//===---- DeviceEnvironmentImpl.h - Virtual GPU device environment - C++ --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_VGPU_SRC_DEVICEENVIRONMENTIMPL_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_VGPU_SRC_DEVICEENVIRONMENTIMPL_H
+
+#include "DeviceEnvironment.h"
+#include <barrier>
+#include <cstdio>
+#include <functional>
+#include <map>
+#include <thread>
+#include <vector>
+
+/// State shared by the threads of one warp: the warp id, the number of
+/// threads, a per-lane exchange buffer for shuffles, and the barriers the
+/// threads of the warp meet at.
+class WarpEnvironmentTy {
+  const unsigned ID;
+  const unsigned NumThreads;
+
+  /// One slot per lane, written and read by the shuffle operations.
+  std::vector<int32_t> ShuffleBuffer;
+
+  // Every barrier expects all NumThreads threads of the warp.
+  std::barrier<std::function<void(void)>> Barrier;
+  std::barrier<std::function<void(void)>> ShuffleBarrier;
+  std::barrier<std::function<void(void)>> ShuffleDownBarrier;
+
+public:
+  WarpEnvironmentTy(unsigned ID, unsigned NumThreads)
+      : ID(ID), NumThreads(NumThreads), ShuffleBuffer(NumThreads),
+        Barrier(NumThreads, []() {}), ShuffleBarrier(NumThreads, []() {}),
+        ShuffleDownBarrier(NumThreads, []() {}) {}
+
+  unsigned getWarpId() const { return ID; }
+  int getNumThreads() const { return NumThreads; }
+
+  /// Block until all threads of the warp have arrived.
+  void sync() { Barrier.arrive_and_wait(); }
+
+  /// Store \p Var in the slot of lane \p LaneId.
+  void writeShuffleBuffer(int32_t Var, unsigned LaneId) {
+    ShuffleBuffer[LaneId] = Var;
+  }
+
+  /// Read the slot of lane \p LaneId.
+  int32_t getShuffleBuffer(unsigned LaneId) { return ShuffleBuffer[LaneId]; }
+
+  void waitShuffleBarrier() { ShuffleBarrier.arrive_and_wait(); }
+
+  // Wait on the barrier dedicated to shuffle-down; it was declared and
+  // initialised but ShuffleBarrier was used here instead.
+  void waitShuffleDownBarrier() { ShuffleDownBarrier.arrive_and_wait(); }
+};
+
+/// State shared by the threads of one CTA: its id, its thread count, the
+/// number of CTAs, and three barriers that each expect NumThreads arrivals.
+class CTAEnvironmentTy {
+public:
+  using BarrierTy = std::barrier<std::function<void(void)>>;
+
+  unsigned ID;
+  unsigned NumThreads;
+  unsigned NumBlocks;
+
+  BarrierTy Barrier;      // Used by fence().
+  BarrierTy SyncThreads;  // Used by syncThreads().
+  BarrierTy NamedBarrier; // Used by namedBarrier().
+
+  CTAEnvironmentTy(unsigned ID, unsigned NumThreads, unsigned NumBlocks)
+      : ID(ID), NumThreads(NumThreads), NumBlocks(NumBlocks),
+        Barrier(NumThreads, []() {}), SyncThreads(NumThreads, []() {}),
+        NamedBarrier(NumThreads, []() {}) {}
+
+  unsigned getId() const { return ID; }
+
+  unsigned getNumThreads() const { return NumThreads; }
+
+  unsigned getNumBlocks() const { return NumBlocks; }
+
+  /// Block until all threads of the CTA have called fence().
+  void fence() { Barrier.arrive_and_wait(); }
+
+  /// Block until all threads of the CTA have called syncThreads().
+  void syncThreads() { SyncThreads.arrive_and_wait(); }
+
+  /// Block until all threads of the CTA have called namedBarrier().
+  void namedBarrier() { NamedBarrier.arrive_and_wait(); }
+};
+
+/// Identity of the thread block a worker is currently executing: the block id
+/// and the total number of blocks.
+class ThreadBlockEnvironmentTy {
+  unsigned ID;
+  unsigned NumBlocks;
+
+public:
+  ThreadBlockEnvironmentTy(unsigned BlockId, unsigned BlockCount)
+      : ID(BlockId), NumBlocks(BlockCount) {}
+
+  /// Id of this block.
+  unsigned getId() const { return ID; }
+
+  /// Total number of blocks.
+  unsigned getNumBlocks() const { return NumBlocks; }
+};
+
+namespace VGPUImpl {
+/// Per-thread view of the emulated GPU: the thread's position inside its warp,
+/// its block and the whole device, plus the warp, CTA and thread-block
+/// environments it synchronises through.
+class ThreadEnvironmentTy {
+  unsigned ThreadIdInWarp;
+  unsigned ThreadIdInBlock;
+  unsigned GlobalThreadIdx;
+
+  WarpEnvironmentTy *WarpEnvironment;
+  /// Owned while a team is being executed (see setBlockEnv/resetBlockEnv);
+  /// nullptr otherwise.
+  ThreadBlockEnvironmentTy *ThreadBlockEnvironment;
+  CTAEnvironmentTy *CTAEnvironment;
+
+public:
+  /// \p ThreadId is the lane inside warp \p WE; \p CTAE is the CTA the warp
+  /// belongs to.
+  ThreadEnvironmentTy(unsigned ThreadId, WarpEnvironmentTy *WE,
+                      CTAEnvironmentTy *CTAE)
+      : ThreadIdInWarp(ThreadId),
+        ThreadIdInBlock(WE->getWarpId() * WE->getNumThreads() + ThreadId),
+        GlobalThreadIdx(CTAE->getId() * CTAE->getNumThreads() +
+                        ThreadIdInBlock),
+        WarpEnvironment(WE), ThreadBlockEnvironment(nullptr),
+        CTAEnvironment(CTAE) {}
+
+  /// Install the block environment for the team about to run; ownership is
+  /// taken and released again by resetBlockEnv().
+  void setBlockEnv(ThreadBlockEnvironmentTy *TBE) {
+    ThreadBlockEnvironment = TBE;
+  }
+
+  /// Delete the current block environment, if any.
+  void resetBlockEnv() {
+    delete ThreadBlockEnvironment;
+    ThreadBlockEnvironment = nullptr;
+  }
+
+  unsigned getThreadIdInWarp() const { return ThreadIdInWarp; }
+  unsigned getThreadIdInBlock() const { return ThreadIdInBlock; }
+  unsigned getGlobalThreadId() const { return GlobalThreadIdx; }
+
+  unsigned getBlockSize() const { return CTAEnvironment->getNumThreads(); }
+
+  unsigned getBlockId() const { return ThreadBlockEnvironment->getId(); }
+
+  unsigned getNumberOfBlocks() const {
+    return ThreadBlockEnvironment->getNumBlocks();
+  }
+
+  /// The function used to fall off its end without returning a value.
+  /// NOTE(review): the kernel size is taken to be the total number of threads
+  /// in the launch (blocks * threads per block) - confirm against the
+  /// DeviceRTL caller.
+  unsigned getKernelSize() const {
+    return getNumberOfBlocks() * getBlockSize();
+  }
+
+  // FIXME: This is wrong
+  LaneMaskTy getActiveMask() const { return ~0U; }
+
+  void fenceTeam() { CTAEnvironment->fence(); }
+  void syncWarp() { WarpEnvironment->sync(); }
+
+  /// Every thread of the warp publishes \p Var and receives the value
+  /// published by lane \p SrcLane (taken modulo the warp size). All threads
+  /// of the warp have to call this together.
+  int32_t shuffle(int32_t Var, uint64_t SrcLane) {
+    // First barrier: nobody is still reading the buffer of a previous
+    // exchange. Second barrier: every lane has written its value.
+    WarpEnvironment->waitShuffleBarrier();
+    WarpEnvironment->writeShuffleBuffer(Var, ThreadIdInWarp);
+    WarpEnvironment->waitShuffleBarrier();
+    Var = WarpEnvironment->getShuffleBuffer(SrcLane % getWarpSize());
+    return Var;
+  }
+
+  /// Every thread of the warp publishes \p Var and receives the value of the
+  /// lane \p Delta positions above it, wrapping around at the warp size. All
+  /// threads of the warp have to call this together.
+  int32_t shuffleDown(int32_t Var, uint32_t Delta) {
+    WarpEnvironment->waitShuffleDownBarrier();
+    WarpEnvironment->writeShuffleBuffer(Var, ThreadIdInWarp);
+    WarpEnvironment->waitShuffleDownBarrier();
+    Var = WarpEnvironment->getShuffleBuffer((ThreadIdInWarp + Delta) %
+                                            getWarpSize());
+    return Var;
+  }
+
+  /// Meet the other threads of the CTA at the named barrier when \p Generic
+  /// is true and at the sync-threads barrier otherwise.
+  void namedBarrier(bool Generic) {
+    if (Generic) {
+      CTAEnvironment->namedBarrier();
+    } else {
+      CTAEnvironment->syncThreads();
+    }
+  }
+
+  /// Issue a thread fence; \p MemoryOrder is cast directly to
+  /// std::memory_order.
+  void fenceKernel(int32_t MemoryOrder) {
+    std::atomic_thread_fence(static_cast<std::memory_order>(MemoryOrder));
+  }
+
+  unsigned getWarpSize() const { return WarpEnvironment->getNumThreads(); }
+};
+} // namespace VGPUImpl
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_VGPU_SRC_DEVICEENVIRONMENTIMPL_H
Index: openmp/libomptarget/plugins/vgpu/src/DeviceEnvironment.cpp
===================================================================
--- /dev/null
+++ openmp/libomptarget/plugins/vgpu/src/DeviceEnvironment.cpp
@@ -0,0 +1,118 @@
+//===---- DeviceEnvironment.cpp - Virtual GPU Device Environment -- C++ ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of VGPU environment classes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DeviceEnvironment.h"
+#include "DeviceEnvironmentImpl.h"
+#include <barrier>
+#include <cstdint>
+#include <mutex>
+
+std::mutex AtomicIncLock;
+
+// Wrapping increment serialised by AtomicIncLock: the value at Address becomes
+// 0 if it was >= Val and is incremented by one otherwise. Returns the value
+// that was stored before the update. Ordering is not used.
+uint32_t VGPUImpl::atomicInc(uint32_t *Address, uint32_t Val, int Ordering) {
+  std::lock_guard G(AtomicIncLock);
+  const uint32_t Previous = *Address;
+  *Address = (Previous >= Val) ? 0 : Previous + 1;
+  return Previous;
+}
+
+// Acquire the lock word by spinning: retry the compare-and-swap from Unset to
+// Set until it reports Unset as the previous value. After each failed attempt
+// the thread busy-waits until at least 1000 * BlockId clock ticks have passed.
+// OmpSpin is not used.
+void VGPUImpl::setLock(uint32_t *Lock, uint32_t Unset, uint32_t Set,
+                       uint32_t OmpSpin, uint32_t BlockId,
+                       uint32_t(atomicCAS)(uint32_t *, uint32_t, uint32_t,
+                                           int)) {
+  // TODO: not sure spinning is a good idea here..
+  while (atomicCAS((uint32_t *)Lock, Unset, Set, __ATOMIC_SEQ_CST) != Unset) {
+    const std::clock_t Begin = std::clock();
+    std::clock_t Elapsed;
+    do {
+      const std::clock_t Current = std::clock();
+      // Second branch covers a clock value that is not past the start value.
+      Elapsed = Current > Begin ? Current - Begin
+                                : Current + (0xffffffff - Begin);
+    } while (Elapsed < 1000 * BlockId);
+  } // wait for 0 to be the read value
+}
+
+extern thread_local ThreadEnvironmentTy *ThreadEnvironment; // Declared here only; the definition lives in another translation unit.
+
+ThreadEnvironmentTy *getThreadEnvironment() { return ThreadEnvironment; } // Returns the calling thread's environment pointer.
+
+ThreadEnvironmentTy::ThreadEnvironmentTy(unsigned Id, WarpEnvironmentTy *WE,
+                                         CTAEnvironmentTy *CTAE)
+    : Impl(new VGPUImpl::ThreadEnvironmentTy(Id, WE, CTAE)) {} // Allocates the implementation object that every method below forwards to.
+
+ThreadEnvironmentTy::~ThreadEnvironmentTy() { delete Impl; } // Releases the implementation object created by the constructor.
+
+void ThreadEnvironmentTy::fenceTeam() { Impl->fenceTeam(); }
+
+void ThreadEnvironmentTy::syncWarp() { Impl->syncWarp(); }
+
+unsigned ThreadEnvironmentTy::getThreadIdInWarp() const {
+  return Impl->getThreadIdInWarp();
+}
+
+unsigned ThreadEnvironmentTy::getThreadIdInBlock() const {
+  return Impl->getThreadIdInBlock();
+}
+
+unsigned ThreadEnvironmentTy::getGlobalThreadId() const {
+  return Impl->getGlobalThreadId();
+}
+
+unsigned ThreadEnvironmentTy::getBlockSize() const {
+  return Impl->getBlockSize();
+}
+
+unsigned ThreadEnvironmentTy::getKernelSize() const {
+  return Impl->getKernelSize();
+}
+
+unsigned ThreadEnvironmentTy::getBlockId() const { return Impl->getBlockId(); }
+
+unsigned ThreadEnvironmentTy::getNumberOfBlocks() const {
+  return Impl->getNumberOfBlocks();
+}
+
+LaneMaskTy ThreadEnvironmentTy::getActiveMask() const {
+  return Impl->getActiveMask();
+}
+
+int32_t ThreadEnvironmentTy::shuffle(int32_t Var, uint64_t SrcLane) {
+  return Impl->shuffle(Var, SrcLane);
+}
+
+int32_t ThreadEnvironmentTy::shuffleDown(int32_t Var, uint32_t Delta) {
+  return Impl->shuffleDown(Var, Delta);
+}
+
+void ThreadEnvironmentTy::fenceKernel(int32_t MemoryOrder) {
+  return Impl->fenceKernel(MemoryOrder); // Returning a void expression; equivalent to a plain call.
+}
+
+void ThreadEnvironmentTy::namedBarrier(bool Generic) {
+  Impl->namedBarrier(Generic);
+}
+
+void ThreadEnvironmentTy::setBlockEnv(ThreadBlockEnvironmentTy *TBE) {
+  Impl->setBlockEnv(TBE); // The implementation deletes TBE in resetBlockEnv().
+}
+
+void ThreadEnvironmentTy::resetBlockEnv() { Impl->resetBlockEnv(); }
+
+unsigned ThreadEnvironmentTy::getWarpSize() const {
+  return Impl->getWarpSize();
+}
Index: openmp/libomptarget/plugins/vgpu/CMakeLists.txt
===================================================================
--- /dev/null
+++ openmp/libomptarget/plugins/vgpu/CMakeLists.txt
@@ -0,0 +1,58 @@
+set(tmachine_name "vgpu")
+set(tmachine_libname "vgpu")
+set(tmachine_triple "x86_64-vgpu")
+set(elf_machine_id "62")
+
+if(LIBOMPTARGET_DEP_LIBELF_FOUND)
+    if(LIBOMPTARGET_DEP_LIBFFI_FOUND)
+
+        libomptarget_say("Building ${tmachine_name} offloading plugin.")
+
+        include_directories(${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR})
+        include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR})
+        include_directories(${LIBOMPTARGET_INCLUDE_DIR})
+
+        # Define macro to be used as prefix of the runtime messages for this target.
+        add_definitions("-DTARGET_NAME=${tmachine_name}")
+
+        # Define macro with the ELF ID for this target.
+        add_definitions("-DTARGET_ELF_ID=${elf_machine_id}")
+
+        add_library("omptarget.rtl.${tmachine_libname}" SHARED
+                ${CMAKE_CURRENT_SOURCE_DIR}/src/rtl.cpp
+                ${CMAKE_CURRENT_SOURCE_DIR}/src/DeviceEnvironment.cpp)
+
+        # Install plugin under the lib destination folder.
+        install(TARGETS "omptarget.rtl.${tmachine_libname}"
+                LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
+
+        set_target_properties("omptarget.rtl.${tmachine_libname}" PROPERTIES CXX_STANDARD 20)
+        target_compile_options("omptarget.rtl.${tmachine_libname}" PRIVATE "-stdlib=libc++")
+
+        target_link_libraries(
+                "omptarget.rtl.${tmachine_libname}"
+                elf_common
+                ${LIBOMPTARGET_DEP_LIBFFI_LIBRARIES}
+                ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES}
+                dl
+                ${OPENMP_PTHREAD_LIB}
+                "-rdynamic"
+                c++
+                #"-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"
+                )
+
+        list(APPEND LIBOMPTARGET_TESTED_PLUGINS
+                "omptarget.rtl.${tmachine_libname}")
+
+        # Report to the parent scope that we are building a plugin.
+        set(LIBOMPTARGET_SYSTEM_TARGETS
+                "${LIBOMPTARGET_SYSTEM_TARGETS} ${tmachine_triple}" PARENT_SCOPE)
+        set(LIBOMPTARGET_TESTED_PLUGINS
+                "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE)
+
+    else(LIBOMPTARGET_DEP_LIBFFI_FOUND)
+        libomptarget_say("Not building ${tmachine_name} offloading plugin: libffi dependency not found.")
+    endif(LIBOMPTARGET_DEP_LIBFFI_FOUND)
+else(LIBOMPTARGET_DEP_LIBELF_FOUND)
+    libomptarget_say("Not building ${tmachine_name} offloading plugin: libelf dependency not found.")
+endif(LIBOMPTARGET_DEP_LIBELF_FOUND)
Index: openmp/libomptarget/plugins/CMakeLists.txt
===================================================================
--- openmp/libomptarget/plugins/CMakeLists.txt
+++ openmp/libomptarget/plugins/CMakeLists.txt
@@ -75,6 +75,7 @@
 add_subdirectory(ppc64)
 add_subdirectory(ppc64le)
 add_subdirectory(ve)
+add_subdirectory(vgpu)
 add_subdirectory(x86_64)
 add_subdirectory(remote)
 
Index: openmp/libomptarget/include/DeviceEnvironment.h
===================================================================
--- openmp/libomptarget/include/DeviceEnvironment.h
+++ openmp/libomptarget/include/DeviceEnvironment.h
@@ -22,4 +22,64 @@
   uint32_t DynamicMemSize;
 };
 
+using LaneMaskTy = uint64_t;
+
+// Forward declaration
+class WarpEnvironmentTy;
+class ThreadBlockEnvironmentTy;
+class CTAEnvironmentTy;
+namespace VGPUImpl {
+class ThreadEnvironmentTy;
+void setLock(uint32_t *Lock, uint32_t Unset, uint32_t Set, uint32_t OmpSpin,
+             uint32_t BlockId,
+             uint32_t(atomicCAS)(uint32_t *, uint32_t, uint32_t, int));
+uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering);
+} // namespace VGPUImpl
+
+/// Per-thread handle of the virtual GPU. Every member forwards to the
+/// VGPUImpl::ThreadEnvironmentTy object that this handle allocates and owns.
+class ThreadEnvironmentTy {
+  VGPUImpl::ThreadEnvironmentTy *Impl; // Owned; deleted by the destructor.
+
+public:
+  ThreadEnvironmentTy(unsigned Id, WarpEnvironmentTy *WE,
+                      CTAEnvironmentTy *CTAE);
+  ~ThreadEnvironmentTy();
+
+  // Impl is an owning raw pointer, so a copy would delete it twice.
+  ThreadEnvironmentTy(const ThreadEnvironmentTy &) = delete;
+  ThreadEnvironmentTy &operator=(const ThreadEnvironmentTy &) = delete;
+
+  // Thread identifiers.
+  unsigned getThreadIdInWarp() const;
+  unsigned getThreadIdInBlock() const;
+  unsigned getGlobalThreadId() const;
+
+  // Sizes.
+  unsigned getBlockSize() const;
+  unsigned getKernelSize() const;
+  unsigned getWarpSize() const;
+
+  // Block identification.
+  unsigned getBlockId() const;
+  unsigned getNumberOfBlocks() const;
+
+  // Warp-level mask query and shuffles.
+  LaneMaskTy getActiveMask() const;
+  int32_t shuffle(int32_t Var, uint64_t SrcLane);
+  int32_t shuffleDown(int32_t Var, uint32_t Delta);
+
+  // Fences and barriers.
+  void fenceKernel(int32_t MemoryOrder);
+  void fenceTeam();
+  void syncWarp();
+  void namedBarrier(bool Generic);
+
+  // Block environment hooks.
+  void setBlockEnv(ThreadBlockEnvironmentTy *TBE);
+  void resetBlockEnv();
+};
+
+ThreadEnvironmentTy *getThreadEnvironment(void);
+
 #endif
Index: openmp/libomptarget/DeviceRTL/src/Utils.cpp
===================================================================
--- openmp/libomptarget/DeviceRTL/src/Utils.cpp
+++ openmp/libomptarget/DeviceRTL/src/Utils.cpp
@@ -45,6 +45,24 @@
 
 #pragma omp end declare variant
 
+/// Virtual GPU Implementation
+///
+///{
+#pragma omp begin declare variant match(                                       \
+    device = {arch(x86, x86_64)}, implementation = {extension(match_any)})
+
+void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
+  *LowBits = (uint32_t)(Val & static_cast<uint64_t>(0x00000000FFFFFFFF));
+  *HighBits =
+      (uint32_t)((Val & static_cast<uint64_t>(0xFFFFFFFF00000000)) >> 32);
+}
+
+uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
+  return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits;
+}
+
+#pragma omp end declare variant
+
 /// NVPTX Implementation
 ///
 ///{
@@ -109,6 +127,26 @@
 #pragma omp end declare variant
 } // namespace impl
 
+/// Virtual GPU Implementation
+///
+///{
+#pragma omp begin declare variant match(                                       \
+    device = {arch(x86, x86_64)}, implementation = {extension(match_any)})
+
+#include "DeviceEnvironment.h"
+namespace impl {
+
+int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
+  return getThreadEnvironment()->shuffle(Var, SrcLane);
+}
+
+int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) {
+  return getThreadEnvironment()->shuffleDown(Var, Delta);
+}
+
+} // namespace impl
+#pragma omp end declare variant
+
 uint64_t utils::pack(uint32_t LowBits, uint32_t HighBits) {
   return impl::Pack(LowBits, HighBits);
 }
Index: openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
===================================================================
--- openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -214,6 +214,71 @@
 
 } // namespace impl
 
+/// Virtual GPU Implementation
+///
+///{
+#pragma omp begin declare variant match(                                       \
+    device = {arch(x86, x86_64)}, implementation = {extension(match_any)})
+
+#include "DeviceEnvironment.h"
+namespace impl {
+
+uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering) {
+  return VGPUImpl::atomicInc(Address, Val, Ordering);
+}
+
+void namedBarrierInit() {}
+
+void namedBarrier() {
+  uint32_t NumThreads = omp_get_num_threads();
+  ASSERT(NumThreads % mapping::getWarpSize() == 0);
+  getThreadEnvironment()->namedBarrier(true);
+}
+
+void fenceTeam(int) { getThreadEnvironment()->fenceTeam(); }
+
+void fenceKernel(int memory_order) {
+  getThreadEnvironment()->fenceKernel(memory_order);
+}
+
+// No need to sync with the host, so forward the ordering to fenceKernel.
+void fenceSystem(int memory_order) { fenceKernel(memory_order); }
+
+void syncWarp(__kmpc_impl_lanemask_t Mask) {
+  getThreadEnvironment()->syncWarp();
+}
+
+void syncThreads() { getThreadEnvironment()->namedBarrier(false); }
+
+constexpr uint32_t OMP_SPIN = 1000;
+constexpr uint32_t UNSET = 0;
+constexpr uint32_t SET = 1;
+
+// TODO: This seems to hide a bug in the declare variant handling. If it is
+// called before it is defined here, the overload won't happen. Investigate
+// later!
+void unsetLock(omp_lock_t *Lock) {
+  (void)atomicExchange((uint32_t *)Lock, UNSET, __ATOMIC_SEQ_CST);
+}
+
+int testLock(omp_lock_t *Lock) {
+  return atomicAdd((uint32_t *)Lock, 0u, __ATOMIC_SEQ_CST);
+}
+
+void initLock(omp_lock_t *Lock) { unsetLock(Lock); }
+
+void destoryLock(omp_lock_t *Lock) { unsetLock(Lock); }
+
+void setLock(omp_lock_t *Lock) {
+  VGPUImpl::setLock((uint32_t *)Lock, UNSET, SET, OMP_SPIN,
+                    mapping::getBlockId(), atomicCAS);
+}
+
+} // namespace impl
+
+#pragma omp end declare variant
+///}
+
 void synchronize::init(bool IsSPMD) {
   if (!IsSPMD)
     impl::namedBarrierInit();
Index: openmp/libomptarget/DeviceRTL/src/Misc.cpp
===================================================================
--- openmp/libomptarget/DeviceRTL/src/Misc.cpp
+++ openmp/libomptarget/DeviceRTL/src/Misc.cpp
@@ -16,10 +16,9 @@
 namespace _OMP {
 namespace impl {
 
-/// AMDGCN Implementation
+/// Generic Implementation - AMDGCN, VGPU
 ///
 ///{
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
 
 double getWTick() { return ((double)1E-9); }
 
@@ -31,8 +30,6 @@
   return 0;
 }
 
-#pragma omp end declare variant
-
 /// NVPTX Implementation
 ///
 ///{
Index: openmp/libomptarget/DeviceRTL/src/Mapping.cpp
===================================================================
--- openmp/libomptarget/DeviceRTL/src/Mapping.cpp
+++ openmp/libomptarget/DeviceRTL/src/Mapping.cpp
@@ -156,14 +156,90 @@
          mapping::getWarpSize();
 }
 
+uint32_t getWarpSize() { return getGridValue().GV_Warp_Size; }
+
 #pragma omp end declare variant
 ///}
 
-uint32_t getWarpSize() { return getGridValue().GV_Warp_Size; }
+} // namespace impl
+} // namespace _OMP
+
+/// Virtual GPU Implementation
+///
+///{
+#pragma omp begin declare variant match(                                       \
+    device = {arch(x86, x86_64)}, implementation = {extension(match_any)})
+
+#include "DeviceEnvironment.h"
+
+namespace _OMP {
+namespace impl {
+
+constexpr const llvm::omp::GV &getGridValue() {
+  return llvm::omp::VirtualGpuGridValues;
+}
+
+LaneMaskTy activemask() {
+  // One bit per lane; shift in 64 bits since `1 << 31` overflows an int.
+  LaneMaskTy B = 0;
+  for (uint32_t N = mapping::getWarpSize(); N;)
+    B |= (LaneMaskTy)1 << (--N);
+  return B;
+}
+
+LaneMaskTy lanemaskLT() {
+  const uint32_t Lane = mapping::getThreadIdInWarp();
+  LaneMaskTy Ballot = mapping::activemask();
+  LaneMaskTy Mask = ((LaneMaskTy)1 << Lane) - (LaneMaskTy)1;
+  return Mask & Ballot;
+}
+
+LaneMaskTy lanemaskGT() {
+  const uint32_t Lane = mapping::getThreadIdInWarp();
+  if (Lane == (mapping::getWarpSize() - 1))
+    return 0;
+  LaneMaskTy Ballot = mapping::activemask();
+  LaneMaskTy Mask = (~((LaneMaskTy)0)) << (Lane + 1);
+  return Mask & Ballot;
+}
+
+uint32_t getThreadIdInWarp() {
+  return mapping::getThreadIdInBlock() % mapping::getWarpSize(); // any size
+}
+
+uint32_t getThreadIdInBlock() {
+  return getThreadEnvironment()->getThreadIdInBlock();
+}
+
+uint32_t getBlockSize() { return getThreadEnvironment()->getBlockSize(); }
+
+uint32_t getKernelSize() { return getThreadEnvironment()->getKernelSize(); }
+
+uint32_t getBlockId() { return getThreadEnvironment()->getBlockId(); }
+
+uint32_t getNumberOfBlocks() {
+  return getThreadEnvironment()->getNumberOfBlocks();
+}
+
+uint32_t getNumberOfProcessorElements() { return mapping::getBlockSize(); }
+
+uint32_t getWarpId() {
+  return mapping::getThreadIdInBlock() / mapping::getWarpSize();
+}
+
+uint32_t getWarpSize() { return getThreadEnvironment()->getWarpSize(); }
+
+uint32_t getNumberOfWarpsInBlock() {
+  return (mapping::getBlockSize() + mapping::getWarpSize() - 1) /
+         mapping::getWarpSize();
+}
 
 } // namespace impl
 } // namespace _OMP
 
+#pragma omp end declare variant
+///}
+
 bool mapping::isMainThreadInGenericMode(bool IsSPMD) {
   if (IsSPMD || icv::Level)
     return false;
@@ -237,5 +313,13 @@
 __attribute__((noinline)) uint32_t __kmpc_get_hardware_num_threads_in_block() {
   return mapping::getNumberOfProcessorElements();
 }
+
+__attribute__((noinline)) uint32_t __kmpc_get_warp_size() {
+  return mapping::getWarpSize();
+}
+
+__attribute__((noinline)) uint32_t __kmpc_get_block_size() {
+  return mapping::getBlockSize();
+}
 }
 #pragma omp end declare target
Index: openmp/libomptarget/DeviceRTL/src/Kernel.cpp
===================================================================
--- openmp/libomptarget/DeviceRTL/src/Kernel.cpp
+++ openmp/libomptarget/DeviceRTL/src/Kernel.cpp
@@ -103,6 +103,8 @@
   if (IsSPMD)
     return;
 
+  synchronize::threads();
+
   // Signal the workers to exit the state machine and exit the kernel.
   state::ParallelRegionFn = nullptr;
 }
Index: openmp/libomptarget/DeviceRTL/include/Interface.h
===================================================================
--- openmp/libomptarget/DeviceRTL/include/Interface.h
+++ openmp/libomptarget/DeviceRTL/include/Interface.h
@@ -352,6 +352,16 @@
 int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
 int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
 ///}
+
+/// Target properties
+///
+///{
+uint32_t __kmpc_get_warp_size();
+
+uint32_t __kmpc_get_block_size();
+
+uint32_t __kmpc_get_thread_id_in_block();
+///}
 }
 
 #endif
Index: openmp/libomptarget/DeviceRTL/CMakeLists.txt
===================================================================
--- openmp/libomptarget/DeviceRTL/CMakeLists.txt
+++ openmp/libomptarget/DeviceRTL/CMakeLists.txt
@@ -270,3 +270,68 @@
   # Install bitcode library under the lib destination folder.
   install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} DESTINATION "${OPENMP_INSTALL_LIBDIR}")
 endforeach()
+
+set(bc_flags -S -x c++ -std=c++20
+             ${clang_opt_flags}
+             -target x86_64-vgpu
+             -Xclang -emit-llvm-bc
+             -Xclang -aux-triple -Xclang ${aux_triple}
+             -fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device
+             # No `-Xclang -target-feature` here: the option requires a value.
+             -I${include_directory}
+             -I${devicertl_base_directory}/../include
+             ${LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL}
+             -stdlib=libc++
+)
+
+add_custom_target(omptarget-vgpu-bc)
+
+set(bc_files "")
+foreach(src ${src_files})
+  get_filename_component(infile ${src} ABSOLUTE)
+  get_filename_component(outfile ${src} NAME)
+  set(outfile "${outfile}-vgpu.bc")
+
+  add_custom_command(OUTPUT ${outfile}
+    COMMAND ${cuda_compiler} ${bc_flags}
+      ${infile} -o ${outfile}
+    DEPENDS ${infile}
+    IMPLICIT_DEPENDS CXX ${infile}
+    COMMENT "Building LLVM bitcode ${outfile}"
+    VERBATIM
+  )
+  set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile})
+
+  list(APPEND bc_files ${outfile})
+endforeach()
+
+set(bclib_name "libomptarget-x86_64-vgpu.bc")
+
+# Link to a bitcode library.
+add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
+    COMMAND ${bc_linker}
+      -o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} ${bc_files}
+    DEPENDS ${bc_files}
+    COMMENT "Linking LLVM bitcode ${bclib_name}"
+)
+
+add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}_opt
+    COMMAND ${opt}
+      -O1 -o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
+    DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
+    COMMENT "Optimizing LLVM bitcode ${bclib_name}"
+)
+set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name})
+
+set(bclib_target_name "omptarget-x86_64-vgpu-bc")
+
+add_custom_target(${bclib_target_name} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}_opt)
+add_dependencies(omptarget-vgpu-bc ${bclib_target_name})
+
+# Copy library to destination.
+add_custom_command(TARGET ${bclib_target_name} POST_BUILD
+                  COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
+                  ${LIBOMPTARGET_LIBRARY_DIR})
+
+# Install bitcode library under the lib destination folder.
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} DESTINATION "${OPENMP_INSTALL_LIBDIR}")
Index: openmp/CMakeLists.txt
===================================================================
--- openmp/CMakeLists.txt
+++ openmp/CMakeLists.txt
@@ -39,6 +39,8 @@
     set(OPENMP_TEST_C_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang.exe)
     set(OPENMP_TEST_CXX_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang++.exe)
   endif()
+
+  list(APPEND LIBOMPTARGET_LLVM_INCLUDE_DIRS ${LLVM_MAIN_INCLUDE_DIR} ${LLVM_BINARY_DIR}/include)
 endif()
 
 # Check and set up common compiler flags.
Index: llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn
===================================================================
--- llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn
+++ llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn
@@ -68,6 +68,7 @@
     "CGOpenMPRuntimeAMDGCN.cpp",
     "CGOpenMPRuntimeGPU.cpp",
     "CGOpenMPRuntimeNVPTX.cpp",
+    "CGOpenMPRuntimeVirtualGPU.cpp",
     "CGRecordLayoutBuilder.cpp",
     "CGStmt.cpp",
     "CGStmtOpenMP.cpp",
Index: llvm/lib/Transforms/IPO/OpenMPOpt.cpp
===================================================================
--- llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -43,6 +43,9 @@
 #include "llvm/Transforms/Utils/CallGraphUpdater.h"
 #include "llvm/Transforms/Utils/CodeExtractor.h"
 
+#include <unordered_map>
+#include <vector>
+
 using namespace llvm;
 using namespace omp;
 
@@ -785,6 +788,12 @@
       if (PrintOpenMPKernels)
         printKernels();
 
+      {
+        llvm::Triple Triple(M.getTargetTriple());
+        if (Triple.getVendor() == llvm::Triple::VGPU)
+          Changed |= expandSharedVariable();
+      }
+
       Changed |= runAttributor(IsModulePass);
 
       // Recollect uses, in case Attributor deleted any.
@@ -1771,6 +1780,13 @@
   /// the cases we can avoid taking the address of a function.
   bool rewriteDeviceCodeStateMachine();
 
+  /// Expand shared variables when the target doesn't support them, such as
+  /// host offloading. Taking a global named ThreadStates as an example:
+  /// ThreadStates->xxx
+  /// will be expanded to the following per-block form:
+  /// ThreadStates.array[omp_get_team_num()]->xxx
+  bool expandSharedVariable();
+
   ///
   ///}}
 
@@ -2063,6 +2079,102 @@
   return Changed;
 }
 
+/// Rewrite each use of \p G with a fresh clone of the instruction chain in
+/// \p InstructionStack, turning constant-expression users into instructions.
+static bool replaceUses(Constant &G,
+                        SmallVectorImpl<Instruction *> &InstructionStack,
+                        SmallPtrSetImpl<Value *> &SeenConstants) {
+  bool Changed = false;
+  // Snapshot the uses first; the rewrite below mutates G's use list.
+  SmallVector<Use *> Uses(make_pointer_range(G.uses()));
+  for (auto *U : Uses) {
+    if (auto *C = dyn_cast<ConstantExpr>(U->getUser())) {
+      bool IsNew = SeenConstants.insert(C).second;
+      (void)IsNew; // Keeps the insert out of the assert, which NDEBUG drops.
+      assert(IsNew && "Constant has two operands that need to be replaced, "
+                      "not supported yet!");
+      Instruction *ConstAsInst = C->getAsInstruction();
+      ConstAsInst->setOperand(U->getOperandNo(), InstructionStack.back());
+      InstructionStack.push_back(ConstAsInst);
+      Changed |= replaceUses(*C, InstructionStack, SeenConstants);
+      assert(InstructionStack.back() == ConstAsInst && "Stack broken!");
+      InstructionStack.pop_back();
+      ConstAsInst->deleteValue();
+    } else if (auto *UserI = dyn_cast<Instruction>(U->getUser())) {
+      Instruction *LastInst = nullptr;
+      Instruction *InstClone = nullptr;
+      for (auto *Inst : InstructionStack) {
+        Instruction *LastInstClone = InstClone;
+        InstClone = Inst->clone();
+        if (LastInstClone)
+          InstClone->replaceUsesOfWith(LastInst, LastInstClone);
+        Instruction *IP = UserI;
+        if (auto *PHI = dyn_cast<PHINode>(UserI))
+          IP = PHI->getIncomingBlock(*U)->getTerminator();
+        InstClone->insertBefore(IP);
+        LastInst = Inst;
+      }
+      UserI->setOperand(U->getOperandNo(), InstClone);
+      Changed = true;
+    } else {
+      // Neither a constant expression nor an instruction: left untouched.
+      LLVM_DEBUG(dbgs() << "Unknown user: " << *U->getUser() << "\n");
+    }
+  }
+  return Changed;
+}
+
+bool OpenMPOpt::expandSharedVariable() {
+  bool Changed = false;
+  // Number of per-block copies created for every address-space-3 global.
+  constexpr const uint64_t MaxNumBlocks = 256;
+
+  FunctionCallee BlockIdFn = OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
+      M, OMPRTL_omp_get_team_num);
+
+  auto *Int32Zero = ConstantInt::get(OMPInfoCache.OMPBuilder.Int32, 0);
+  SmallPtrSet<Value *, 16> SeenConstants;
+  SmallVector<Instruction *> InstructionStack;
+
+  for (auto &G : M.globals()) {
+    // Only address-space-3 globals are expanded. TODO: Rewrite with enum value
+    if (G.getAddressSpace() != 3)
+      continue;
+
+    // Create an array type holding MaxNumBlocks elements of G's value type.
+    PointerType *GlobalType =
+        PointerType::get(G.getType()->getElementType(), 3);
+    ArrayType *TheArrayType =
+        ArrayType::get(GlobalType->getElementType(), MaxNumBlocks);
+    Constant *TheArray = new GlobalVariable(
+        M, TheArrayType, /* isConstant */ false, GlobalValue::PrivateLinkage,
+        UndefValue::get(TheArrayType), G.getName() + ".array");
+
+    TheArray = ConstantExpr::getAddrSpaceCast(
+        TheArray,
+        PointerType::get(TheArray->getType()->getPointerElementType(), 3));
+    // Template chain: call omp_get_team_num, then index the array with it.
+    auto *BlockId = CallInst::Create(BlockIdFn, {}, "block_id");
+    InstructionStack.push_back(BlockId);
+    auto *NewElement = GetElementPtrInst::Create(
+        nullptr, TheArray, {Int32Zero, BlockId}, G.getName());
+    InstructionStack.push_back(NewElement);
+
+    Changed |= replaceUses(G, InstructionStack, SeenConstants);
+    assert(InstructionStack.back() == NewElement && "Broken stack!");
+    // The templates were never inserted into a block; delete them directly.
+    InstructionStack.pop_back();
+    NewElement->deleteValue();
+    assert(InstructionStack.back() == BlockId && "Broken stack!");
+
+    InstructionStack.pop_back();
+    BlockId->deleteValue();
+    assert(InstructionStack.empty() && "Broken stack!");
+  }
+
+  return Changed;
+}
+
 /// Abstract Attribute for tracking ICV values.
 struct AAICVTracker : public StateWrapper<BooleanState, AbstractAttribute> {
   using Base = StateWrapper<BooleanState, AbstractAttribute>;
Index: llvm/lib/Support/Triple.cpp
===================================================================
--- llvm/lib/Support/Triple.cpp
+++ llvm/lib/Support/Triple.cpp
@@ -179,6 +179,8 @@
   case PC: return "pc";
   case SCEI: return "scei";
   case SUSE: return "suse";
+  case VGPU:
+    return "vgpu";
   }
 
   llvm_unreachable("Invalid VendorType!");
@@ -482,22 +484,23 @@
 
 static Triple::VendorType parseVendor(StringRef VendorName) {
   return StringSwitch<Triple::VendorType>(VendorName)
-    .Case("apple", Triple::Apple)
-    .Case("pc", Triple::PC)
-    .Case("scei", Triple::SCEI)
-    .Case("sie", Triple::SCEI)
-    .Case("fsl", Triple::Freescale)
-    .Case("ibm", Triple::IBM)
-    .Case("img", Triple::ImaginationTechnologies)
-    .Case("mti", Triple::MipsTechnologies)
-    .Case("nvidia", Triple::NVIDIA)
-    .Case("csr", Triple::CSR)
-    .Case("myriad", Triple::Myriad)
-    .Case("amd", Triple::AMD)
-    .Case("mesa", Triple::Mesa)
-    .Case("suse", Triple::SUSE)
-    .Case("oe", Triple::OpenEmbedded)
-    .Default(Triple::UnknownVendor);
+      .Case("apple", Triple::Apple)
+      .Case("pc", Triple::PC)
+      .Case("scei", Triple::SCEI)
+      .Case("sie", Triple::SCEI)
+      .Case("fsl", Triple::Freescale)
+      .Case("ibm", Triple::IBM)
+      .Case("img", Triple::ImaginationTechnologies)
+      .Case("mti", Triple::MipsTechnologies)
+      .Case("nvidia", Triple::NVIDIA)
+      .Case("csr", Triple::CSR)
+      .Case("myriad", Triple::Myriad)
+      .Case("amd", Triple::AMD)
+      .Case("mesa", Triple::Mesa)
+      .Case("suse", Triple::SUSE)
+      .Case("oe", Triple::OpenEmbedded)
+      .Case("vgpu", Triple::VGPU)
+      .Default(Triple::UnknownVendor);
 }
 
 static Triple::OSType parseOS(StringRef OSName) {
Index: llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
===================================================================
--- llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -222,6 +222,7 @@
 __OMP_RTL(omp_get_level, false, Int32, )
 __OMP_RTL(omp_get_ancestor_thread_num, false, Int32, Int32)
 __OMP_RTL(omp_get_team_size, false, Int32, Int32)
+__OMP_RTL(omp_get_team_num, false, Int32, )
 __OMP_RTL(omp_get_active_level, false, Int32, )
 __OMP_RTL(omp_in_final, false, Int32, )
 __OMP_RTL(omp_get_proc_bind, false, Int32, )
@@ -454,6 +455,10 @@
 __OMP_RTL(__kmpc_warp_active_thread_mask, false, Int64,)
 __OMP_RTL(__kmpc_syncwarp, false, Void, Int64)
 
+__OMP_RTL(__kmpc_get_warp_size, false, Int32, )
+__OMP_RTL(__kmpc_get_block_size, false, Int32, )
+__OMP_RTL(__kmpc_get_thread_id_in_block, false, Int32, )
+
 __OMP_RTL(__kmpc_is_generic_main_thread_id, false, Int8, Int32)
 
 __OMP_RTL(__last, false, Void, )
@@ -645,6 +650,7 @@
 __OMP_RTL_ATTRS(omp_get_ancestor_thread_num, GetterAttrs, AttributeSet(),
                 ParamAttrs())
 __OMP_RTL_ATTRS(omp_get_team_size, GetterAttrs, AttributeSet(), ParamAttrs())
+__OMP_RTL_ATTRS(omp_get_team_num, GetterAttrs, AttributeSet(), ParamAttrs())
 __OMP_RTL_ATTRS(omp_get_active_level, GetterAttrs, AttributeSet(), ParamAttrs())
 __OMP_RTL_ATTRS(omp_in_final, GetterAttrs, AttributeSet(), ParamAttrs())
 __OMP_RTL_ATTRS(omp_get_proc_bind, GetterAttrs, AttributeSet(), ParamAttrs())
Index: llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
===================================================================
--- llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
+++ llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
@@ -114,6 +114,16 @@
     128,  // GV_Default_WG_Size
 };
 
+/// For Virtual GPUs
+static constexpr GV VirtualGpuGridValues = {
+    256,  // GV_Slot_Size
+    32,   // GV_Warp_Size
+    1024, // GV_Max_Teams
+    896,  // GV_SimpleBufferSize
+    1024, // GV_Max_WG_Size
+    128,  // GV_Default_WG_Size
+};
+
 } // namespace omp
 } // namespace llvm
 
Index: llvm/include/llvm/ADT/Triple.h
===================================================================
--- llvm/include/llvm/ADT/Triple.h
+++ llvm/include/llvm/ADT/Triple.h
@@ -162,7 +162,8 @@
     Mesa,
     SUSE,
     OpenEmbedded,
-    LastVendorType = OpenEmbedded
+    VGPU,
+    LastVendorType = VGPU
   };
   enum OSType {
     UnknownOS,
Index: clang/lib/Frontend/CompilerInvocation.cpp
===================================================================
--- clang/lib/Frontend/CompilerInvocation.cpp
+++ clang/lib/Frontend/CompilerInvocation.cpp
@@ -3983,8 +3983,10 @@
   }
 
   // Set CUDA mode for OpenMP target NVPTX/AMDGCN if specified in options
-  Opts.OpenMPCUDAMode = Opts.OpenMPIsDevice && (T.isNVPTX() || T.isAMDGCN()) &&
-                        Args.hasArg(options::OPT_fopenmp_cuda_mode);
+  Opts.OpenMPCUDAMode =
+      Opts.OpenMPIsDevice &&
+      (T.isNVPTX() || T.isAMDGCN() || T.getVendor() == llvm::Triple::VGPU) &&
+      Args.hasArg(options::OPT_fopenmp_cuda_mode);
 
   // Set CUDA mode for OpenMP target NVPTX/AMDGCN if specified in options
   Opts.OpenMPCUDAForceFullRuntime =
Index: clang/lib/Driver/ToolChains/Gnu.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Gnu.cpp
+++ clang/lib/Driver/ToolChains/Gnu.cpp
@@ -3078,4 +3078,13 @@
   if (!DriverArgs.hasFlag(options::OPT_fuse_init_array,
                           options::OPT_fno_use_init_array, true))
     CC1Args.push_back("-fno-use-init-array");
+
+  if (DriverArgs.hasArg(options::OPT_S))
+    return;
+
+  if (getTriple().getVendor() == llvm::Triple::VGPU) {
+    std::string BitcodeSuffix = "x86_64-vgpu";
+    clang::driver::tools::addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args,
+                                             BitcodeSuffix, getTriple());
+  }
 }
Index: clang/lib/CodeGen/CodeGenModule.cpp
===================================================================
--- clang/lib/CodeGen/CodeGenModule.cpp
+++ clang/lib/CodeGen/CodeGenModule.cpp
@@ -21,6 +21,7 @@
 #include "CGOpenMPRuntime.h"
 #include "CGOpenMPRuntimeAMDGCN.h"
 #include "CGOpenMPRuntimeNVPTX.h"
+#include "CGOpenMPRuntimeVirtualGPU.h"
 #include "CodeGenFunction.h"
 #include "CodeGenPGO.h"
 #include "ConstantEmitter.h"
@@ -254,7 +255,9 @@
     OpenMPRuntime.reset(new CGOpenMPRuntimeAMDGCN(*this));
     break;
   default:
-    if (LangOpts.OpenMPSimd)
+    if (getTriple().getVendor() == llvm::Triple::VGPU) {
+      OpenMPRuntime.reset(new CGOpenMPRuntimeVirtualGPU(*this));
+    } else if (LangOpts.OpenMPSimd)
       OpenMPRuntime.reset(new CGOpenMPSIMDRuntime(*this));
     else
       OpenMPRuntime.reset(new CGOpenMPRuntime(*this));
Index: clang/lib/CodeGen/CMakeLists.txt
===================================================================
--- clang/lib/CodeGen/CMakeLists.txt
+++ clang/lib/CodeGen/CMakeLists.txt
@@ -62,6 +62,7 @@
   CGOpenMPRuntimeAMDGCN.cpp
   CGOpenMPRuntimeGPU.cpp
   CGOpenMPRuntimeNVPTX.cpp
+  CGOpenMPRuntimeVirtualGPU.cpp
   CGRecordLayoutBuilder.cpp
   CGStmt.cpp
   CGStmtOpenMP.cpp
Index: clang/lib/CodeGen/CGOpenMPRuntimeVirtualGPU.h
===================================================================
--- /dev/null
+++ clang/lib/CodeGen/CGOpenMPRuntimeVirtualGPU.h
@@ -0,0 +1,44 @@
+//== CGOpenMPRuntimeVirtualGPU.h - Interface to OpenMP Virtual GPU Runtimes ==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides a class for OpenMP runtime code generation specialized to
+// virtual GPU from generalized CGOpenMPRuntimeGPU class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_CODEGEN_CGOpenMPRuntimeVirtualGPU_H
+#define LLVM_CLANG_LIB_CODEGEN_CGOpenMPRuntimeVirtualGPU_H
+
+#include "CGOpenMPRuntime.h"
+#include "CGOpenMPRuntimeGPU.h"
+#include "CodeGenFunction.h"
+#include "clang/AST/StmtOpenMP.h"
+
+namespace clang {
+namespace CodeGen {
+
+class CGOpenMPRuntimeVirtualGPU final : public CGOpenMPRuntimeGPU {
+
+public:
+  explicit CGOpenMPRuntimeVirtualGPU(CodeGenModule &CGM);
+
+  /// Get the GPU warp size.
+  llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) override;
+
+  /// Get the id of the current thread on the GPU.
+  llvm::Value *getGPUThreadID(CodeGenFunction &CGF) override;
+
+  void createOffloadEntry(llvm::Constant *ID, llvm::Constant *Addr,
+                          uint64_t Size, int32_t Flags,
+                          llvm::GlobalValue::LinkageTypes Linkage) override;
+};
+
+} // namespace CodeGen
+} // namespace clang
+
+#endif // LLVM_CLANG_LIB_CODEGEN_CGOpenMPRuntimeVirtualGPU_H
Index: clang/lib/CodeGen/CGOpenMPRuntimeVirtualGPU.cpp
===================================================================
--- /dev/null
+++ clang/lib/CodeGen/CGOpenMPRuntimeVirtualGPU.cpp
@@ -0,0 +1,54 @@
+//= CGOpenMPRuntimeVirtualGPU.cpp - Interface to OpenMP Virtual GPU Runtimes =//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides a class for OpenMP runtime code generation specialized to
+// virtual GPU target from generalized CGOpenMPRuntimeGPU class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CGOpenMPRuntimeVirtualGPU.h"
+#include "CGOpenMPRuntimeGPU.h"
+#include "CodeGenFunction.h"
+#include "clang/AST/Attr.h"
+#include "clang/AST/DeclOpenMP.h"
+#include "clang/AST/StmtOpenMP.h"
+#include "clang/AST/StmtVisitor.h"
+#include "clang/Basic/Cuda.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/IR/IntrinsicsNVPTX.h"
+
+using namespace clang;
+using namespace CodeGen;
+using namespace llvm::omp;
+
+CGOpenMPRuntimeVirtualGPU::CGOpenMPRuntimeVirtualGPU(CodeGenModule &CGM)
+    : CGOpenMPRuntimeGPU(CGM) {
+  if (!CGM.getLangOpts().OpenMPIsDevice)
+    llvm_unreachable("OpenMP Virtual GPU can only handle device code.");
+}
+
+llvm::Value *CGOpenMPRuntimeVirtualGPU::getGPUWarpSize(CodeGenFunction &CGF) {
+  ArrayRef<llvm::Value *> Args{};
+  return CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+                                 CGM.getModule(), OMPRTL___kmpc_get_warp_size),
+                             Args);
+}
+
+llvm::Value *CGOpenMPRuntimeVirtualGPU::getGPUThreadID(CodeGenFunction &CGF) {
+  ArrayRef<llvm::Value *> Args{};
+  return CGF.EmitRuntimeCall(
+      OMPBuilder.getOrCreateRuntimeFunction(
+          CGM.getModule(), OMPRTL___kmpc_get_thread_id_in_block),
+      Args);
+}
+
+void CGOpenMPRuntimeVirtualGPU::createOffloadEntry(
+    llvm::Constant *ID, llvm::Constant *Addr, uint64_t Size, int32_t Flags,
+    llvm::GlobalValue::LinkageTypes Linkage) {
+  CGOpenMPRuntime::createOffloadEntry(ID, Addr, Size, Flags, Linkage);
+}
Index: clang/lib/Basic/Targets/X86.h
===================================================================
--- clang/lib/Basic/Targets/X86.h
+++ clang/lib/Basic/Targets/X86.h
@@ -17,6 +17,7 @@
 #include "clang/Basic/TargetInfo.h"
 #include "clang/Basic/TargetOptions.h"
 #include "llvm/ADT/Triple.h"
+#include "llvm/Frontend/OpenMP/OMPGridValues.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/X86TargetParser.h"
 
@@ -45,6 +46,28 @@
     272  // ptr64
 };
 
+static const unsigned X86VGPUAddrSpaceMap[] = {
+    0,   // Default
+    1,   // opencl_global
+    3,   // opencl_local
+    4,   // opencl_constant
+    0,   // opencl_private
+    0,   // opencl_generic
+    1,   // opencl_global_device
+    1,   // opencl_global_host
+    1,   // cuda_device
+    4,   // cuda_constant
+    3,   // cuda_shared
+    1,   // sycl_global
+    0,   // sycl_global_device
+    0,   // sycl_global_host
+    3,   // sycl_local
+    0,   // sycl_private
+    270, // ptr32_sptr
+    271, // ptr32_uptr
+    272  // ptr64
+};
+
 // X86 target abstract base class; x86-32 and x86-64 are very close, so
 // most of the implementation can be shared.
 class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
@@ -161,6 +184,9 @@
         getTriple().isOSWindows() && getTriple().isOSBinFormatCOFF();
     if (IsWinCOFF)
       MaxVectorAlign = MaxTLSAlign = 8192u * getCharWidth();
+
+    if (Triple.getVendor() == llvm::Triple::VGPU)
+      AddrSpaceMap = &X86VGPUAddrSpaceMap;
   }
 
   const char *getLongDoubleMangling() const override {
@@ -387,6 +413,10 @@
   uint64_t getPointerAlignV(unsigned AddrSpace) const override {
     return getPointerWidthV(AddrSpace);
   }
+
+  const llvm::omp::GV &getGridValue() const override {
+    return llvm::omp::VirtualGpuGridValues;
+  }
 };
 
 // X86-32 generic target

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D113359: [Libomptarget][WIP] Introduce VGPU Plugin

Reply via email to