================ @@ -0,0 +1,888 @@ +//===- InstrProfilingPlatformROCm.cpp - Profile data ROCm platform -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +extern "C" { +#include "InstrProfiling.h" +#include "InstrProfilingInternal.h" +#include "InstrProfilingPort.h" +} + +#include "interception/interception.h" +// C library headers (not <cstdio> etc.): clang_rt.profile is built with +// -nostdinc++ and avoids the C++ standard library (see profile/CMakeLists.txt). +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#ifdef _WIN32 +#define WIN32_LEAN_AND_MEAN +#include <windows.h> +#else +#include <pthread.h> +#endif + +/* Serialize one-time HIP loader resolution and DynamicModules mutations. + * Inline to avoid a sanitizer_common dependency. */ +#ifdef _WIN32 +static INIT_ONCE HipLoadedOnce = INIT_ONCE_STATIC_INIT; +static CRITICAL_SECTION DynamicModulesLock; +static INIT_ONCE DynamicModulesLockInit = INIT_ONCE_STATIC_INIT; +static BOOL CALLBACK initDynamicModulesLockCb(PINIT_ONCE, PVOID, PVOID *) { + InitializeCriticalSection(&DynamicModulesLock); + return TRUE; +} +static void lockDynamicModules(void) { + InitOnceExecuteOnce(&DynamicModulesLockInit, initDynamicModulesLockCb, NULL, + NULL); + EnterCriticalSection(&DynamicModulesLock); +} +static void unlockDynamicModules(void) { + LeaveCriticalSection(&DynamicModulesLock); +} +#else +static pthread_once_t HipLoadedOnce = PTHREAD_ONCE_INIT; +static pthread_mutex_t DynamicModulesLock = PTHREAD_MUTEX_INITIALIZER; +static void lockDynamicModules(void) { + pthread_mutex_lock(&DynamicModulesLock); +} +static void unlockDynamicModules(void) { + pthread_mutex_unlock(&DynamicModulesLock); +} +#endif + +static int processDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex, + const char *Target); + +static int isVerboseMode() { + static int IsVerbose = -1; + if (IsVerbose == -1) + IsVerbose = getenv("LLVM_PROFILE_VERBOSE") != nullptr; + return IsVerbose; +} + +/* -------------------------------------------------------------------------- */ +/* Dynamic loading of HIP runtime symbols */ +/* -------------------------------------------------------------------------- */ + +typedef int (*hipGetSymbolAddressTy)(void **, const void *); +typedef int (*hipMemcpyTy)(void *, const void *, size_t, int); +typedef int (*hipModuleGetGlobalTy)(void **, size_t *, void *, const char *); +typedef int (*hipGetDeviceCountTy)(int *); +typedef int (*hipGetDeviceTy)(int *); +typedef int (*hipSetDeviceTy)(int); + +/* Minimal hipDeviceProp_t (HIP 6.x R0600): only gcnArchName at offset 1160 + * is read. Padded to 4096 to tolerate ABI growth. */ +typedef struct { + char padding[1160]; + char gcnArchName[256]; + char tail_padding[2680]; +} HipDevicePropMinimal; +typedef int (*hipGetDevicePropertiesTy)(HipDevicePropMinimal *, int); + +static hipGetSymbolAddressTy pHipGetSymbolAddress = nullptr; +static hipMemcpyTy pHipMemcpy = nullptr; +static hipModuleGetGlobalTy pHipModuleGetGlobal = nullptr; +static hipGetDeviceCountTy pHipGetDeviceCount = nullptr; +static hipGetDeviceTy pHipGetDevice = nullptr; +static hipSetDeviceTy pHipSetDevice = nullptr; +static hipGetDevicePropertiesTy pHipGetDeviceProperties = nullptr; + +#define MAX_DEVICES 16 +static int NumDevices = 0; +static char DeviceArchNames[MAX_DEVICES][256]; + +/* -------------------------------------------------------------------------- */ +/* Device-to-host copies */ +/* Keep HIP-only to avoid an HSA dependency. */ +/* -------------------------------------------------------------------------- */ + +static void doEnsureHipLoaded(void) { + if (!__interception::DynamicLoaderAvailable()) { + if (isVerboseMode()) + PROF_NOTE("%s", "Dynamic library loading not available - " + "HIP profiling disabled\n"); + return; + } + +#ifdef _WIN32 + static const char HipLibName[] = "amdhip64.dll"; +#else + static const char HipLibName[] = "libamdhip64.so"; +#endif + + void *Handle = __interception::OpenLibrary(HipLibName); + if (!Handle) + return; + + pHipGetSymbolAddress = (hipGetSymbolAddressTy)__interception::LookupSymbol( + Handle, "hipGetSymbolAddress"); + pHipMemcpy = (hipMemcpyTy)__interception::LookupSymbol(Handle, "hipMemcpy"); + pHipModuleGetGlobal = (hipModuleGetGlobalTy)__interception::LookupSymbol( + Handle, "hipModuleGetGlobal"); + pHipGetDeviceCount = (hipGetDeviceCountTy)__interception::LookupSymbol( + Handle, "hipGetDeviceCount"); + pHipGetDevice = + (hipGetDeviceTy)__interception::LookupSymbol(Handle, "hipGetDevice"); + pHipSetDevice = + (hipSetDeviceTy)__interception::LookupSymbol(Handle, "hipSetDevice"); + pHipGetDeviceProperties = + (hipGetDevicePropertiesTy)__interception::LookupSymbol( + Handle, "hipGetDevicePropertiesR0600"); + if (!pHipGetDeviceProperties) + pHipGetDeviceProperties = + (hipGetDevicePropertiesTy)__interception::LookupSymbol( + Handle, "hipGetDeviceProperties"); + + if (pHipGetDeviceCount && pHipGetDeviceProperties) { + int Count = 0; + if (pHipGetDeviceCount(&Count) == 0) { + if (Count > MAX_DEVICES) + Count = MAX_DEVICES; + HipDevicePropMinimal Prop; + for (int i = 0; i < Count; ++i) { + __builtin_memset(&Prop, 0, sizeof(Prop)); + if (pHipGetDeviceProperties(&Prop, i) == 0) { + strncpy(DeviceArchNames[i], Prop.gcnArchName, + sizeof(DeviceArchNames[i]) - 1); + DeviceArchNames[i][sizeof(DeviceArchNames[i]) - 1] = '\0'; + if (isVerboseMode()) + PROF_NOTE("Device %d arch: %s\n", i, DeviceArchNames[i]); + } + } + NumDevices = Count; + } + } +} + +#ifdef _WIN32 +static BOOL CALLBACK ensureHipLoadedCb(PINIT_ONCE, PVOID, PVOID *) { + doEnsureHipLoaded(); + return TRUE; +} +#endif + +static void ensureHipLoaded(void) { +#ifdef _WIN32 + InitOnceExecuteOnce(&HipLoadedOnce, ensureHipLoadedCb, NULL, NULL); +#else + pthread_once(&HipLoadedOnce, doEnsureHipLoaded); +#endif +} + +/* -------------------------------------------------------------------------- */ +/* Public wrappers that forward to the loaded HIP symbols */ +/* -------------------------------------------------------------------------- */ + +static int hipGetSymbolAddress(void **devPtr, const void *symbol) { + ensureHipLoaded(); + return pHipGetSymbolAddress ? pHipGetSymbolAddress(devPtr, symbol) : -1; +} + +static int hipMemcpy(void *dest, const void *src, size_t len, + int kind /*2=DToH*/) { + ensureHipLoaded(); + return pHipMemcpy ? pHipMemcpy(dest, src, len, kind) : -1; +} + +/* Device section symbols must be registered with CLR first; otherwise + * hipMemcpy may take a CPU path and crash. */ +static int memcpyDeviceToHost(void *Dst, const void *Src, size_t Size) { + return hipMemcpy(Dst, Src, Size, 2 /* DToH */); +} + +static int hipModuleGetGlobal(void **DevPtr, size_t *Bytes, void *Module, + const char *Name) { + ensureHipLoaded(); + return pHipModuleGetGlobal ? pHipModuleGetGlobal(DevPtr, Bytes, Module, Name) + : -1; +} + +static int hipGetDevice(int *DeviceId) { + ensureHipLoaded(); + return pHipGetDevice ? pHipGetDevice(DeviceId) : -1; +} + +static int hipSetDevice(int DeviceId) { + ensureHipLoaded(); + return pHipSetDevice ? pHipSetDevice(DeviceId) : -1; +} + +static const char *getDeviceArchName(int DeviceId) { + if (DeviceId < 0 || DeviceId >= NumDevices || !DeviceArchNames[DeviceId][0]) + return "amdgpu"; + return DeviceArchNames[DeviceId]; +} + +/* -------------------------------------------------------------------------- */ +/* Dynamic module tracking */ +/* -------------------------------------------------------------------------- */ + +/* Per-TU profile entry inside a dynamic module. + * A single dynamic module may contain multiple TUs (e.g. -fgpu-rdc). */ +typedef struct { + void *DeviceVar; /* device address of __llvm_profile_sections_<CUID> */ + int Processed; /* 0 = not yet collected, 1 = data already copied */ +} OffloadDynamicTUInfo; + +/* One entry per hipModuleLoad call. */ +typedef struct { + void *ModulePtr; /* hipModule_t handle */ + OffloadDynamicTUInfo *TUs; /* array of per-TU entries */ + int NumTUs; + int CapTUs; +} OffloadDynamicModuleInfo; + +static OffloadDynamicModuleInfo *DynamicModules = nullptr; +static int NumDynamicModules = 0; +static int CapDynamicModules = 0; + +/* -------------------------------------------------------------------------- */ +/* ELF symbol enumeration (manual parse: compiler-rt cannot link LLVM Support) + */ +/* -------------------------------------------------------------------------- */ + +#if __has_include(<elf.h>) +#include <elf.h> + +/* Callback invoked for every matching symbol name found in the ELF image. + * Return 0 to continue iteration, non-zero to stop. */ +typedef int (*SymbolCallback)(const char *Name, void *UserData); + +/* If Image is a clang offload bundle, return a pointer to the first embedded + * ELF. Returns Image if not a bundle, nullptr if a bundle holds no ELF. */ +static const void *unwrapOffloadBundle(const void *Image) { + static const char BundleMagic[] = "__CLANG_OFFLOAD_BUNDLE__"; + if (memcmp(Image, BundleMagic, sizeof(BundleMagic) - 1) != 0) + return Image; /* Not a bundle, return as-is. */ + + const char *Buf = (const char *)Image; + uint64_t NumEntries; + __builtin_memcpy(&NumEntries, Buf + sizeof(BundleMagic) - 1, + sizeof(uint64_t)); + + /* Walk the entry table (starts at offset 32). */ + const char *Cursor = Buf + 32; + for (uint64_t I = 0; I < NumEntries; ++I) { + uint64_t EntryOffset, EntrySize, IDSize; + __builtin_memcpy(&EntryOffset, Cursor, sizeof(EntryOffset)); + Cursor += sizeof(EntryOffset); + __builtin_memcpy(&EntrySize, Cursor, sizeof(EntrySize)); + Cursor += sizeof(EntrySize); + __builtin_memcpy(&IDSize, Cursor, sizeof(IDSize)); + Cursor += sizeof(IDSize); + Cursor += IDSize; /* skip entry ID */ + + if (EntrySize >= sizeof(Elf64_Ehdr)) { + const Elf64_Ehdr *E = (const Elf64_Ehdr *)(Buf + EntryOffset); + if (E->e_ident[EI_MAG0] == ELFMAG0 && E->e_ident[EI_MAG1] == ELFMAG1 && + E->e_ident[EI_MAG2] == ELFMAG2 && E->e_ident[EI_MAG3] == ELFMAG3) { + return (const void *)(Buf + EntryOffset); + } + } + } + + PROF_WARN("%s", "offload bundle contains no valid ELF entries\n"); + return nullptr; +} + +/* Invoke CB for every global symbol in Image (an AMDGPU ELF or offload bundle) + * whose name starts with PREFIX. Image may be null. */ +static void enumerateElfSymbols(const void *Image, const char *Prefix, + SymbolCallback CB, void *UserData) { + if (!Image) + return; + + Image = unwrapOffloadBundle(Image); + if (!Image) + return; + + const Elf64_Ehdr *Ehdr = (const Elf64_Ehdr *)Image; + if (Ehdr->e_ident[EI_MAG0] != ELFMAG0 || Ehdr->e_ident[EI_MAG1] != ELFMAG1 || + Ehdr->e_ident[EI_MAG2] != ELFMAG2 || Ehdr->e_ident[EI_MAG3] != ELFMAG3) { + if (isVerboseMode()) + PROF_NOTE("%s", "Image is not a valid ELF, skipping enumeration\n"); + return; + } + + size_t PrefixLen = strlen(Prefix); + const char *Base = (const char *)Image; + const Elf64_Shdr *Shdrs = (const Elf64_Shdr *)(Base + Ehdr->e_shoff); + + for (int i = 0; i < Ehdr->e_shnum; ++i) { + if (Shdrs[i].sh_type != SHT_SYMTAB) + continue; + + const Elf64_Sym *Syms = (const Elf64_Sym *)(Base + Shdrs[i].sh_offset); + int NumSyms = Shdrs[i].sh_size / sizeof(Elf64_Sym); + /* String table is the section referenced by sh_link. */ + const char *StrTab = Base + Shdrs[Shdrs[i].sh_link].sh_offset; + + for (int j = 0; j < NumSyms; ++j) { + if (Syms[j].st_name == 0) + continue; + const char *Name = StrTab + Syms[j].st_name; + if (strncmp(Name, Prefix, PrefixLen) == 0) { + if (CB(Name, UserData)) + return; + } + } + } +} + +/* State passed through the enumeration callback. */ +typedef struct { + void *Module; /* hipModule_t */ + OffloadDynamicModuleInfo *ModInfo; +} EnumState; + +/* Register one __llvm_profile_sections_<CUID> symbol on the module entry. + * hipModuleGetGlobal also registers the device address with CLR so hipMemcpy + * can copy from it later. */ +static int registerPrfSymbol(const char *Name, void *UserData) { + EnumState *S = (EnumState *)UserData; + OffloadDynamicModuleInfo *MI = S->ModInfo; + + /* The symbol is the per-TU sections struct itself, not a pointer + * indirection, so this address is the hipMemcpy source. */ + void *DeviceVar = nullptr; + size_t Bytes = 0; + if (hipModuleGetGlobal(&DeviceVar, &Bytes, S->Module, Name) != 0) { + PROF_WARN("failed to get symbol %s for module %p\n", Name, S->Module); + return 0; /* continue */ + } + + if (MI->NumTUs >= MI->CapTUs) { + int NewCap = MI->CapTUs ? MI->CapTUs * 2 : 4; + OffloadDynamicTUInfo *New = (OffloadDynamicTUInfo *)realloc( + MI->TUs, NewCap * sizeof(OffloadDynamicTUInfo)); + if (!New) { + PROF_ERR("%s\n", "failed to grow TU array"); + return 0; + } + MI->TUs = New; + MI->CapTUs = NewCap; + } + OffloadDynamicTUInfo *TU = &MI->TUs[MI->NumTUs++]; + TU->DeviceVar = DeviceVar; + TU->Processed = 0; + + (void)Name; + return 0; /* continue enumeration */ +} + +#endif /* __has_include(<elf.h>) */ + +/* -------------------------------------------------------------------------- */ +/* Registration / un-registration helpers */ +/* -------------------------------------------------------------------------- */ + +extern "C" void +__llvm_profile_offload_register_dynamic_module(int ModuleLoadRc, void **Ptr, + const void *Image) { + if (ModuleLoadRc) + return; + + lockDynamicModules(); + + if (isVerboseMode()) + PROF_NOTE("Registering loaded module %d: rc=%d, module=%p, image=%p\n", + NumDynamicModules, ModuleLoadRc, *Ptr, Image); + + if (NumDynamicModules >= CapDynamicModules) { + int NewCap = CapDynamicModules ? CapDynamicModules * 2 : 64; + OffloadDynamicModuleInfo *New = (OffloadDynamicModuleInfo *)realloc( + DynamicModules, NewCap * sizeof(OffloadDynamicModuleInfo)); + if (!New) { + unlockDynamicModules(); + return; + } + DynamicModules = New; + CapDynamicModules = NewCap; + } + + OffloadDynamicModuleInfo *MI = &DynamicModules[NumDynamicModules++]; + MI->ModulePtr = *Ptr; + MI->TUs = nullptr; + MI->NumTUs = 0; + MI->CapTUs = 0; + + /* Dynamic-module profiling needs ELF parsing for symbol enumeration. */ +#if __has_include(<elf.h>) + EnumState State = {*Ptr, MI}; + enumerateElfSymbols(Image, "__llvm_profile_sections_", registerPrfSymbol, + &State); +#else + (void)Image; + if (isVerboseMode()) + PROF_NOTE("%s", + "Dynamic module profiling not supported on this platform\n"); +#endif + + if (MI->NumTUs == 0) { + PROF_WARN("no __llvm_profile_sections_* symbols found in module %p\n", + *Ptr); + } else if (isVerboseMode()) { + PROF_NOTE("Module %p: registered %d TU(s)\n", *Ptr, MI->NumTUs); + } + + unlockDynamicModules(); +} + +extern "C" void __llvm_profile_offload_unregister_dynamic_module(void *Ptr) { + lockDynamicModules(); + for (int i = 0; i < NumDynamicModules; ++i) { + OffloadDynamicModuleInfo *MI = &DynamicModules[i]; + + /* HIP recycles hipModule_t addresses; drained slots are cleared so a + * recycled handle finds the new slot, not the dead one. */ + if (MI->ModulePtr != Ptr) + continue; + + if (isVerboseMode()) + PROF_NOTE("Unregistering module %p (%d TUs)\n", MI->ModulePtr, + MI->NumTUs); + + for (int t = 0; t < MI->NumTUs; ++t) { + OffloadDynamicTUInfo *TU = &MI->TUs[t]; + if (TU->Processed) { + if (isVerboseMode()) + PROF_NOTE("Module %p TU %d already processed, skipping\n", Ptr, t); + continue; + } + /* Globally unique TU index for the output filename. */ + int TUIndex = i * 1000 + t; ---------------- yxsamliu wrote:
Each TU of each process has a separate profile data, which needs a distinct file name. However, as you suggested, it can be replaced by an atomic counter. will do. https://github.com/llvm/llvm-project/pull/177665 _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
