================
@@ -0,0 +1,888 @@
+//===- InstrProfilingPlatformROCm.cpp - Profile data ROCm platform -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+extern "C" {
+#include "InstrProfiling.h"
+#include "InstrProfilingInternal.h"
+#include "InstrProfilingPort.h"
+}
+
+#include "interception/interception.h"
+// C library headers (not <cstdio> etc.): clang_rt.profile is built with
+// -nostdinc++ and avoids the C++ standard library (see 
profile/CMakeLists.txt).
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#else
+#include <pthread.h>
+#endif
+
+/* Serialize one-time HIP loader resolution and DynamicModules mutations.
+ * Inline to avoid a sanitizer_common dependency. */
+#ifdef _WIN32
+static INIT_ONCE HipLoadedOnce = INIT_ONCE_STATIC_INIT;
+static CRITICAL_SECTION DynamicModulesLock;
+static INIT_ONCE DynamicModulesLockInit = INIT_ONCE_STATIC_INIT;
+static BOOL CALLBACK initDynamicModulesLockCb(PINIT_ONCE, PVOID, PVOID *) {
+  InitializeCriticalSection(&DynamicModulesLock);
+  return TRUE;
+}
+static void lockDynamicModules(void) {
+  InitOnceExecuteOnce(&DynamicModulesLockInit, initDynamicModulesLockCb, NULL,
+                      NULL);
+  EnterCriticalSection(&DynamicModulesLock);
+}
+static void unlockDynamicModules(void) {
+  LeaveCriticalSection(&DynamicModulesLock);
+}
+#else
+static pthread_once_t HipLoadedOnce = PTHREAD_ONCE_INIT;
+static pthread_mutex_t DynamicModulesLock = PTHREAD_MUTEX_INITIALIZER;
+static void lockDynamicModules(void) {
+  pthread_mutex_lock(&DynamicModulesLock);
+}
+static void unlockDynamicModules(void) {
+  pthread_mutex_unlock(&DynamicModulesLock);
+}
+#endif
+
+static int processDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex,
+                                   const char *Target);
+
+static int isVerboseMode() {
+  static int IsVerbose = -1;
+  if (IsVerbose == -1)
+    IsVerbose = getenv("LLVM_PROFILE_VERBOSE") != nullptr;
+  return IsVerbose;
+}
+
+/* -------------------------------------------------------------------------- 
*/
+/*  Dynamic loading of HIP runtime symbols                                   */
+/* -------------------------------------------------------------------------- 
*/
+
+typedef int (*hipGetSymbolAddressTy)(void **, const void *);
+typedef int (*hipMemcpyTy)(void *, const void *, size_t, int);
+typedef int (*hipModuleGetGlobalTy)(void **, size_t *, void *, const char *);
+typedef int (*hipGetDeviceCountTy)(int *);
+typedef int (*hipGetDeviceTy)(int *);
+typedef int (*hipSetDeviceTy)(int);
+
+/* Minimal hipDeviceProp_t (HIP 6.x R0600): only gcnArchName at offset 1160
+ * is read. Padded to 4096 to tolerate ABI growth. */
+typedef struct {
+  char padding[1160];
+  char gcnArchName[256];
+  char tail_padding[2680];
+} HipDevicePropMinimal;
+typedef int (*hipGetDevicePropertiesTy)(HipDevicePropMinimal *, int);
+
+static hipGetSymbolAddressTy pHipGetSymbolAddress = nullptr;
+static hipMemcpyTy pHipMemcpy = nullptr;
+static hipModuleGetGlobalTy pHipModuleGetGlobal = nullptr;
+static hipGetDeviceCountTy pHipGetDeviceCount = nullptr;
+static hipGetDeviceTy pHipGetDevice = nullptr;
+static hipSetDeviceTy pHipSetDevice = nullptr;
+static hipGetDevicePropertiesTy pHipGetDeviceProperties = nullptr;
+
+#define MAX_DEVICES 16
+static int NumDevices = 0;
+static char DeviceArchNames[MAX_DEVICES][256];
+
+/* -------------------------------------------------------------------------- 
*/
+/*  Device-to-host copies                                                     
*/
+/*  Keep HIP-only to avoid an HSA dependency.                                 
*/
+/* -------------------------------------------------------------------------- 
*/
+
+static void doEnsureHipLoaded(void) {
+  if (!__interception::DynamicLoaderAvailable()) {
+    if (isVerboseMode())
+      PROF_NOTE("%s", "Dynamic library loading not available - "
+                      "HIP profiling disabled\n");
+    return;
+  }
+
+#ifdef _WIN32
+  static const char HipLibName[] = "amdhip64.dll";
+#else
+  static const char HipLibName[] = "libamdhip64.so";
+#endif
+
+  void *Handle = __interception::OpenLibrary(HipLibName);
+  if (!Handle)
+    return;
+
+  pHipGetSymbolAddress = (hipGetSymbolAddressTy)__interception::LookupSymbol(
+      Handle, "hipGetSymbolAddress");
+  pHipMemcpy = (hipMemcpyTy)__interception::LookupSymbol(Handle, "hipMemcpy");
+  pHipModuleGetGlobal = (hipModuleGetGlobalTy)__interception::LookupSymbol(
+      Handle, "hipModuleGetGlobal");
+  pHipGetDeviceCount = (hipGetDeviceCountTy)__interception::LookupSymbol(
+      Handle, "hipGetDeviceCount");
+  pHipGetDevice =
+      (hipGetDeviceTy)__interception::LookupSymbol(Handle, "hipGetDevice");
+  pHipSetDevice =
+      (hipSetDeviceTy)__interception::LookupSymbol(Handle, "hipSetDevice");
+  pHipGetDeviceProperties =
+      (hipGetDevicePropertiesTy)__interception::LookupSymbol(
+          Handle, "hipGetDevicePropertiesR0600");
+  if (!pHipGetDeviceProperties)
+    pHipGetDeviceProperties =
+        (hipGetDevicePropertiesTy)__interception::LookupSymbol(
+            Handle, "hipGetDeviceProperties");
+
+  if (pHipGetDeviceCount && pHipGetDeviceProperties) {
+    int Count = 0;
+    if (pHipGetDeviceCount(&Count) == 0) {
+      if (Count > MAX_DEVICES)
+        Count = MAX_DEVICES;
+      HipDevicePropMinimal Prop;
+      for (int i = 0; i < Count; ++i) {
+        __builtin_memset(&Prop, 0, sizeof(Prop));
+        if (pHipGetDeviceProperties(&Prop, i) == 0) {
+          strncpy(DeviceArchNames[i], Prop.gcnArchName,
+                  sizeof(DeviceArchNames[i]) - 1);
+          DeviceArchNames[i][sizeof(DeviceArchNames[i]) - 1] = '\0';
+          if (isVerboseMode())
+            PROF_NOTE("Device %d arch: %s\n", i, DeviceArchNames[i]);
+        }
+      }
+      NumDevices = Count;
+    }
+  }
+}
+
+#ifdef _WIN32
+static BOOL CALLBACK ensureHipLoadedCb(PINIT_ONCE, PVOID, PVOID *) {
+  doEnsureHipLoaded();
+  return TRUE;
+}
+#endif
+
+static void ensureHipLoaded(void) {
+#ifdef _WIN32
+  InitOnceExecuteOnce(&HipLoadedOnce, ensureHipLoadedCb, NULL, NULL);
+#else
+  pthread_once(&HipLoadedOnce, doEnsureHipLoaded);
+#endif
+}
+
+/* -------------------------------------------------------------------------- 
*/
+/*  Public wrappers that forward to the loaded HIP symbols                   */
+/* -------------------------------------------------------------------------- 
*/
+
+static int hipGetSymbolAddress(void **devPtr, const void *symbol) {
+  ensureHipLoaded();
+  return pHipGetSymbolAddress ? pHipGetSymbolAddress(devPtr, symbol) : -1;
+}
+
+static int hipMemcpy(void *dest, const void *src, size_t len,
+                     int kind /*2=DToH*/) {
+  ensureHipLoaded();
+  return pHipMemcpy ? pHipMemcpy(dest, src, len, kind) : -1;
+}
+
+/* Device section symbols must be registered with CLR first; otherwise
+ * hipMemcpy may take a CPU path and crash. */
+static int memcpyDeviceToHost(void *Dst, const void *Src, size_t Size) {
+  return hipMemcpy(Dst, Src, Size, 2 /* DToH */);
+}
+
+static int hipModuleGetGlobal(void **DevPtr, size_t *Bytes, void *Module,
+                              const char *Name) {
+  ensureHipLoaded();
+  return pHipModuleGetGlobal ? pHipModuleGetGlobal(DevPtr, Bytes, Module, Name)
+                             : -1;
+}
+
+static int hipGetDevice(int *DeviceId) {
+  ensureHipLoaded();
+  return pHipGetDevice ? pHipGetDevice(DeviceId) : -1;
+}
+
+static int hipSetDevice(int DeviceId) {
+  ensureHipLoaded();
+  return pHipSetDevice ? pHipSetDevice(DeviceId) : -1;
+}
+
+static const char *getDeviceArchName(int DeviceId) {
+  if (DeviceId < 0 || DeviceId >= NumDevices || !DeviceArchNames[DeviceId][0])
+    return "amdgpu";
+  return DeviceArchNames[DeviceId];
+}
+
+/* -------------------------------------------------------------------------- 
*/
+/*  Dynamic module tracking                                                   
*/
+/* -------------------------------------------------------------------------- 
*/
+
+/* Per-TU profile entry inside a dynamic module.
+ * A single dynamic module may contain multiple TUs (e.g. -fgpu-rdc). */
+typedef struct {
+  void *DeviceVar; /* device address of __llvm_profile_sections_<CUID> */
+  int Processed;   /* 0 = not yet collected, 1 = data already copied   */
+} OffloadDynamicTUInfo;
+
+/* One entry per hipModuleLoad call. */
+typedef struct {
+  void *ModulePtr;           /* hipModule_t handle                        */
+  OffloadDynamicTUInfo *TUs; /* array of per-TU entries                 */
+  int NumTUs;
+  int CapTUs;
+} OffloadDynamicModuleInfo;
+
+static OffloadDynamicModuleInfo *DynamicModules = nullptr;
+static int NumDynamicModules = 0;
+static int CapDynamicModules = 0;
+
+/* -------------------------------------------------------------------------- 
*/
+/*  ELF symbol enumeration (manual parse: compiler-rt cannot link LLVM Support)
+ */
+/* -------------------------------------------------------------------------- 
*/
+
+#if __has_include(<elf.h>)
+#include <elf.h>
+
+/* Callback invoked for every matching symbol name found in the ELF image.
+ * Return 0 to continue iteration, non-zero to stop. */
+typedef int (*SymbolCallback)(const char *Name, void *UserData);
+
+/* If Image is a clang offload bundle, return a pointer to the first embedded
+ * ELF. Returns Image if not a bundle, nullptr if a bundle holds no ELF. */
+static const void *unwrapOffloadBundle(const void *Image) {
+  static const char BundleMagic[] = "__CLANG_OFFLOAD_BUNDLE__";
+  if (memcmp(Image, BundleMagic, sizeof(BundleMagic) - 1) != 0)
+    return Image; /* Not a bundle, return as-is. */
+
+  const char *Buf = (const char *)Image;
+  uint64_t NumEntries;
+  __builtin_memcpy(&NumEntries, Buf + sizeof(BundleMagic) - 1,
+                   sizeof(uint64_t));
+
+  /* Walk the entry table (starts at offset 32). */
+  const char *Cursor = Buf + 32;
+  for (uint64_t I = 0; I < NumEntries; ++I) {
+    uint64_t EntryOffset, EntrySize, IDSize;
+    __builtin_memcpy(&EntryOffset, Cursor, sizeof(EntryOffset));
+    Cursor += sizeof(EntryOffset);
+    __builtin_memcpy(&EntrySize, Cursor, sizeof(EntrySize));
+    Cursor += sizeof(EntrySize);
+    __builtin_memcpy(&IDSize, Cursor, sizeof(IDSize));
+    Cursor += sizeof(IDSize);
+    Cursor += IDSize; /* skip entry ID */
+
+    if (EntrySize >= sizeof(Elf64_Ehdr)) {
+      const Elf64_Ehdr *E = (const Elf64_Ehdr *)(Buf + EntryOffset);
+      if (E->e_ident[EI_MAG0] == ELFMAG0 && E->e_ident[EI_MAG1] == ELFMAG1 &&
+          E->e_ident[EI_MAG2] == ELFMAG2 && E->e_ident[EI_MAG3] == ELFMAG3) {
+        return (const void *)(Buf + EntryOffset);
+      }
+    }
+  }
+
+  PROF_WARN("%s", "offload bundle contains no valid ELF entries\n");
+  return nullptr;
+}
+
+/* Invoke CB for every global symbol in Image (an AMDGPU ELF or offload bundle)
+ * whose name starts with PREFIX. Image may be null. */
+static void enumerateElfSymbols(const void *Image, const char *Prefix,
+                                SymbolCallback CB, void *UserData) {
+  if (!Image)
+    return;
+
+  Image = unwrapOffloadBundle(Image);
+  if (!Image)
+    return;
+
+  const Elf64_Ehdr *Ehdr = (const Elf64_Ehdr *)Image;
+  if (Ehdr->e_ident[EI_MAG0] != ELFMAG0 || Ehdr->e_ident[EI_MAG1] != ELFMAG1 ||
+      Ehdr->e_ident[EI_MAG2] != ELFMAG2 || Ehdr->e_ident[EI_MAG3] != ELFMAG3) {
+    if (isVerboseMode())
+      PROF_NOTE("%s", "Image is not a valid ELF, skipping enumeration\n");
+    return;
+  }
+
+  size_t PrefixLen = strlen(Prefix);
+  const char *Base = (const char *)Image;
+  const Elf64_Shdr *Shdrs = (const Elf64_Shdr *)(Base + Ehdr->e_shoff);
+
+  for (int i = 0; i < Ehdr->e_shnum; ++i) {
+    if (Shdrs[i].sh_type != SHT_SYMTAB)
+      continue;
+
+    const Elf64_Sym *Syms = (const Elf64_Sym *)(Base + Shdrs[i].sh_offset);
+    int NumSyms = Shdrs[i].sh_size / sizeof(Elf64_Sym);
+    /* String table is the section referenced by sh_link. */
+    const char *StrTab = Base + Shdrs[Shdrs[i].sh_link].sh_offset;
+
+    for (int j = 0; j < NumSyms; ++j) {
+      if (Syms[j].st_name == 0)
+        continue;
+      const char *Name = StrTab + Syms[j].st_name;
+      if (strncmp(Name, Prefix, PrefixLen) == 0) {
+        if (CB(Name, UserData))
+          return;
+      }
+    }
+  }
+}
+
+/* State passed through the enumeration callback. */
+typedef struct {
+  void *Module; /* hipModule_t */
+  OffloadDynamicModuleInfo *ModInfo;
+} EnumState;
+
+/* Register one __llvm_profile_sections_<CUID> symbol on the module entry.
+ * hipModuleGetGlobal also registers the device address with CLR so hipMemcpy
+ * can copy from it later. */
+static int registerPrfSymbol(const char *Name, void *UserData) {
+  EnumState *S = (EnumState *)UserData;
+  OffloadDynamicModuleInfo *MI = S->ModInfo;
+
+  /* The symbol is the per-TU sections struct itself, not a pointer
+   * indirection, so this address is the hipMemcpy source. */
+  void *DeviceVar = nullptr;
+  size_t Bytes = 0;
+  if (hipModuleGetGlobal(&DeviceVar, &Bytes, S->Module, Name) != 0) {
+    PROF_WARN("failed to get symbol %s for module %p\n", Name, S->Module);
+    return 0; /* continue */
+  }
+
+  if (MI->NumTUs >= MI->CapTUs) {
+    int NewCap = MI->CapTUs ? MI->CapTUs * 2 : 4;
+    OffloadDynamicTUInfo *New = (OffloadDynamicTUInfo *)realloc(
+        MI->TUs, NewCap * sizeof(OffloadDynamicTUInfo));
+    if (!New) {
+      PROF_ERR("%s\n", "failed to grow TU array");
+      return 0;
+    }
+    MI->TUs = New;
+    MI->CapTUs = NewCap;
+  }
+  OffloadDynamicTUInfo *TU = &MI->TUs[MI->NumTUs++];
+  TU->DeviceVar = DeviceVar;
+  TU->Processed = 0;
+
+  (void)Name;
+  return 0; /* continue enumeration */
+}
+
+#endif /* __has_include(<elf.h>) */
+
+/* -------------------------------------------------------------------------- 
*/
+/*  Registration / un-registration helpers                                   */
+/* -------------------------------------------------------------------------- 
*/
+
+extern "C" void
+__llvm_profile_offload_register_dynamic_module(int ModuleLoadRc, void **Ptr,
+                                               const void *Image) {
+  if (ModuleLoadRc)
+    return;
+
+  lockDynamicModules();
+
+  if (isVerboseMode())
+    PROF_NOTE("Registering loaded module %d: rc=%d, module=%p, image=%p\n",
+              NumDynamicModules, ModuleLoadRc, *Ptr, Image);
+
+  if (NumDynamicModules >= CapDynamicModules) {
+    int NewCap = CapDynamicModules ? CapDynamicModules * 2 : 64;
+    OffloadDynamicModuleInfo *New = (OffloadDynamicModuleInfo *)realloc(
+        DynamicModules, NewCap * sizeof(OffloadDynamicModuleInfo));
+    if (!New) {
+      unlockDynamicModules();
+      return;
+    }
+    DynamicModules = New;
+    CapDynamicModules = NewCap;
+  }
+
+  OffloadDynamicModuleInfo *MI = &DynamicModules[NumDynamicModules++];
+  MI->ModulePtr = *Ptr;
+  MI->TUs = nullptr;
+  MI->NumTUs = 0;
+  MI->CapTUs = 0;
+
+  /* Dynamic-module profiling needs ELF parsing for symbol enumeration. */
+#if __has_include(<elf.h>)
+  EnumState State = {*Ptr, MI};
+  enumerateElfSymbols(Image, "__llvm_profile_sections_", registerPrfSymbol,
+                      &State);
+#else
+  (void)Image;
+  if (isVerboseMode())
+    PROF_NOTE("%s",
+              "Dynamic module profiling not supported on this platform\n");
+#endif
+
+  if (MI->NumTUs == 0) {
+    PROF_WARN("no __llvm_profile_sections_* symbols found in module %p\n",
+              *Ptr);
+  } else if (isVerboseMode()) {
+    PROF_NOTE("Module %p: registered %d TU(s)\n", *Ptr, MI->NumTUs);
+  }
+
+  unlockDynamicModules();
+}
+
+extern "C" void __llvm_profile_offload_unregister_dynamic_module(void *Ptr) {
+  lockDynamicModules();
+  for (int i = 0; i < NumDynamicModules; ++i) {
+    OffloadDynamicModuleInfo *MI = &DynamicModules[i];
+
+    /* HIP recycles hipModule_t addresses; drained slots are cleared so a
+     * recycled handle finds the new slot, not the dead one. */
+    if (MI->ModulePtr != Ptr)
+      continue;
+
+    if (isVerboseMode())
+      PROF_NOTE("Unregistering module %p (%d TUs)\n", MI->ModulePtr,
+                MI->NumTUs);
+
+    for (int t = 0; t < MI->NumTUs; ++t) {
+      OffloadDynamicTUInfo *TU = &MI->TUs[t];
+      if (TU->Processed) {
+        if (isVerboseMode())
+          PROF_NOTE("Module %p TU %d already processed, skipping\n", Ptr, t);
+        continue;
+      }
+      /* Globally unique TU index for the output filename. */
+      int TUIndex = i * 1000 + t;
----------------
yxsamliu wrote:

Each TU of each process has a separate profile data, which needs a distinct 
file name. However, as you suggested, it can be replaced by an atomic counter. 
will do.

https://github.com/llvm/llvm-project/pull/177665
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
  • [clang] [... Yaxun Liu via cfe-commits
    • [cla... Yaxun Liu via cfe-commits
    • [cla... Yaxun Liu via cfe-commits
    • [cla... Yaxun Liu via cfe-commits
    • [cla... Yaxun Liu via cfe-commits
    • [cla... Yaxun Liu via cfe-commits
    • [cla... Joseph Huber via cfe-commits
    • [cla... Joseph Huber via cfe-commits
    • [cla... Joseph Huber via cfe-commits
    • [cla... Yaxun Liu via cfe-commits
    • [cla... Yaxun Liu via cfe-commits
    • [cla... LLVM Continuous Integration via cfe-commits
    • [cla... Valentin Clement バレンタイン クレメン via cfe-commits

Reply via email to