eu-stackprof is a new tool which profiles processes on a Linux system
using perf_events and outputs gprof gmon.out format program counter
histograms and callgraph-arc profiles; intended as an updated demo of
libdwfl_stacktrace functionality and as a data-gathering tool for the
profiledb initiative.

* configure.ac: Add configure checks for C++20, eu-stackprof
  perf/libpfm dependencies.
* src/Makefile.am (bin_PROGRAMS): Add stackprof.
  (stackprof_*): Add stackprof SOURCES, LDADD, and so forth.
* src/stackprof.cxx: New file.

Co-authored-by: <[email protected]>
Signed-off-by: <[email protected]>
---
 configure.ac      |   23 +-
 src/Makefile.am   |   11 +-
 src/stackprof.cxx | 2083 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 2110 insertions(+), 7 deletions(-)
 create mode 100644 src/stackprof.cxx

diff --git a/configure.ac b/configure.ac
index f22a3f90..e5be95b8 100644
--- a/configure.ac
+++ b/configure.ac
@@ -888,10 +888,21 @@ fi
 AC_CHECK_PROG(HAVE_ZSTD, zstd, yes, no)
 AM_CONDITIONAL([HAVE_ZSTD],[test "x$HAVE_ZSTD" = "xyes"])
 
-# For tests that need to use C++11
-AX_CXX_COMPILE_STDCXX(11, noext, optional)
-AS_IF([test "x$HAVE_CXX11" = "x1"], [HAVE_CXX11=yes], [HAVE_CXX11=no])
-AM_CONDITIONAL([HAVE_CXX11],[test "x$HAVE_CXX11" = "xyes"])
+# For tests, debuginfod and eu-stackprof, which are all gated on C++20 below
+AX_CXX_COMPILE_STDCXX(20, noext, optional)
+AS_IF([test "x$HAVE_CXX20" = "x1"], [HAVE_CXX20=yes], [HAVE_CXX20=no])
+AM_CONDITIONAL([HAVE_CXX20],[test "x$HAVE_CXX20" = "xyes"])
+
+
+# For eu-stackprof
+# optional:
+AC_CHECK_HEADERS([perfmon/pfmlib_perf_event.h])
+AM_CONDITIONAL([HAVE_LIBPFM], [test 
"x${ac_cv_header_perfmon_pfmlib_perf_event_h}" = "xyes" ])
+AC_CHECK_LIB(pfm, pfm_get_os_event_encoding, [AC_SUBST(libpfm_LIBS, '-lpfm')])
+# required:
+AC_CHECK_HEADERS([linux/perf_event.h])
+AM_CONDITIONAL([ENABLE_STACKPROF],[test 
"x${ac_cv_header_linux_perf_event_h}x${HAVE_CXX20}" = "xyesxyes" ])
+
 
 AC_CHECK_HEADERS([execinfo.h])
 
@@ -941,7 +952,7 @@ AS_IF([test "x$with_libarchive" = "xyes" -a 
"x$have_libarchive" != "xyes"], [
 # pronounce judgement on ability to build server, overridden by =yes/=no
 if test "x$enable_debuginfod" = "xno"; then
    true
-elif test "x$have_jsonc$HAVE_CXX11$have_libarchive$have_sqlite3" = 
"xyesyesyesyes"; then
+elif test "x$have_jsonc$HAVE_CXX20$have_libarchive$have_sqlite3" = 
"xyesyesyesyes"; then
    enable_debuginfod=yes
 elif test "x$enable_debuginfod" = "xyes"; then
    AC_MSG_ERROR([unable to build debuginfod, missing libmicrohttpd, sqlite3 or 
libarchive])
@@ -1098,7 +1109,7 @@ AC_MSG_NOTICE([
   EXTRA TEST FEATURES (used with make check)
     have bunzip2 installed (required)  : ${HAVE_BUNZIP2}
     have zstd installed                : ${HAVE_ZSTD}
-    C++11                              : ${HAVE_CXX11}
+    C++20                              : ${HAVE_CXX20}
     debug branch prediction            : ${use_debugpred}
     gprof support                      : ${use_gprof}
     gcov support                       : ${use_gcov}
diff --git a/src/Makefile.am b/src/Makefile.am
index f041d458..f753c70c 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -35,6 +35,9 @@ bin_PROGRAMS = readelf nm size strip elflint findtextrel 
addr2line \
 if ENABLE_STACKTRACE
 bin_PROGRAMS += stacktrace
 endif
+if ENABLE_STACKPROF
+bin_PROGRAMS += stackprof
+endif
 
 noinst_LIBRARIES = libar.a
 
@@ -127,7 +130,13 @@ endif
 elfcompress_LDADD = $(libebl) $(libelf) $(libdw) $(libeu) $(argp_LDADD)
 elfclassify_LDADD = $(libelf) $(libdw) $(libeu) $(argp_LDADD)
 srcfiles_SOURCES = srcfiles.cxx
-srcfiles_LDADD = $(libdw) $(libelf) $(libeu)  $(argp_LDADD) $(libarchive_LIBS) 
$(libdebuginfod)
+srcfiles_LDADD = $(libdw) $(libelf) $(libeu) $(argp_LDADD) $(libarchive_LIBS) 
$(libdebuginfod)
+if ENABLE_STACKPROF
+stackprof_SOURCES = stackprof.cxx
+stackprof_CPPFLAGS = $(AM_CPPFLAGS) $(jsonc_CXXFLAGS)
+stackprof_CXXFLAGS = -Wall
+stackprof_LDADD = $(libebl) $(libdw) $(libelf) $(libeu) $(argp_LDADD) 
$(libpfm_LIBS) $(jsonc_LIBS)
+endif
 
 installcheck-binPROGRAMS: $(bin_PROGRAMS)
        bad=0; pid=$$$$; list="$(bin_PROGRAMS)"; for p in $$list; do \
diff --git a/src/stackprof.cxx b/src/stackprof.cxx
new file mode 100644
index 00000000..33720e80
--- /dev/null
+++ b/src/stackprof.cxx
@@ -0,0 +1,2083 @@
+/* Collect stack-trace profiles of running program(s).
+   Copyright (C) 2025-2026 Red Hat, Inc.
+   This file is part of elfutils.
+
+   This file is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   elfutils is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include "printversion.h"
+
+#include <string>
+#include <memory>
+#include <iomanip>
+#include <map>
+#include <unordered_map>
+#include <vector>
+#include <bitset>
+#include <stdexcept>
+#include <cstring>
+#include <csignal>
+#include <cassert>
+#include <chrono>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <cinttypes>
+#include <format>
+#include <filesystem>
+
+#include <sys/utsname.h>
+
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/wait.h>
+#include <poll.h>
+#ifdef HAVE_LINUX_PERF_EVENT_H
+#include <linux/perf_event.h>
+#endif
+#include <argp.h>
+#include <fcntl.h>
+#include <dirent.h>
+
+#include <system.h>
+
+#ifdef HAVE_PERFMON_PFMLIB_PERF_EVENT_H
+#include <perfmon/pfmlib_perf_event.h>
+#endif
+
+#include <json-c/json.h>
+
+#include <gelf.h>
+#include <dwarf.h>
+#include <libdwfl.h>
+#include <libdw.h>
+#include "../libebl/libebl.h"
+#include "../libdwfl_stacktrace/libdwfl_stacktrace.h"
+
+using namespace std; // so we don't have to std:: prefix everything in here
+
+////////////////////////////////////////////////////////////////////////
+// find_debuginfo callbacks
+
+#ifdef FIND_DEBUGINFO
+
+static char *debuginfo_path = NULL;
+
+static const Dwfl_Callbacks dwfl_cfi_callbacks =
+  {
+    .find_elf = dwflst_tracker_linux_proc_find_elf,
+    .find_debuginfo = dwfl_standard_find_debuginfo,
+    .debuginfo_path = &debuginfo_path,
+  };
+
+#else
+
+/* find_debuginfo callback that never locates separate debuginfo: it
+   always reports failure (-1), so the callbacks table that uses it
+   works with CFI only.  With DEBUG_MODULES defined, every request is
+   logged to stderr.  */
+int
+nop_find_debuginfo (Dwfl_Module *mod __attribute__((unused)),
+                   void **userdata __attribute__((unused)),
+                   const char *modname __attribute__((unused)),
+                   GElf_Addr base __attribute__((unused)),
+                   const char *file_name __attribute__((unused)),
+                   const char *debuglink_file __attribute__((unused)),
+                   GElf_Word debuglink_crc __attribute__((unused)),
+                   char **debuginfo_file_name __attribute__((unused)))
+{
+#ifdef DEBUG_MODULES
+  /* Any of these names may be NULL (e.g. a module without a debuglink);
+     std::format must never be handed a null C string.  */
+  auto or_null = [] (const char *s) { return s != NULL ? s : "(null)"; };
+  cerr << format("nop_find_debuginfo: modname={} file_name={} debuglink_file={}\n",
+                or_null (modname), or_null (file_name), or_null (debuglink_file));
+#endif
+  return -1;
+}
+
+static const Dwfl_Callbacks dwfl_cfi_callbacks =
+{
+  .find_elf = dwflst_tracker_linux_proc_find_elf,
+  .find_debuginfo = nop_find_debuginfo, /* work with CFI only */
+};
+
+#endif /* FIND_DEBUGINFO */
+
+
+////////////////////////////////////////////////////////////////////////
+// class decls
+
+// Unwind statistics for a Dwfl and associated process.
+struct UnwindDwflStats {
+  Dwfl *dwfl; /* Dwfl handle stored for the process */
+  string comm; /* command name of the process -- presumably taken from COMM events; confirm */
+  int max_frames; /* for diagnostic purposes */
+  int total_samples; /* for diagnostic purposes */
+  int lost_samples; /* for diagnostic purposes */
+  int shown_errors; /* for diagnostic purposes */
+  Dwfl_Unwound_Source last_unwound; /* track CFI source, for diagnostic purposes */
+  Dwfl_Unwound_Source worst_unwound; /* track CFI source, for diagnostic purposes */
+};
+
+// Hash functor for pair keys such as (from, to) call-graph arcs.
+struct hash_arc {
+  // Combine the two element hashes asymmetrically (boost::hash_combine
+  // style).  A plain XOR is symmetric and cancels equal halves, so
+  // every (x, x) pair would hash to 0 and (a, b) would always collide
+  // with (b, a).
+  template <class T1, class T2>
+  size_t operator()(const pair<T1, T2> &p) const {
+    size_t seed = hash<T1>()(p.first);
+    seed ^= hash<T2>()(p.second)
+            + static_cast<size_t>(0x9e3779b97f4a7c15ULL)
+            + (seed << 6) + (seed >> 2);
+    return seed;
+  }
+};
+
+// Unwind statistics for a single module identified by build-id.
+struct UnwindModuleStats {
+  map<uint64_t, uint32_t> histogram; /* pc -> hit count, sorted by pc */
+  unordered_map<pair<uint64_t, uint64_t>, uint32_t, hash_arc> callgraph; /* (from, to) -> traversal count */
+
+  /* Count one more sample at PC; a first sighting starts at 1.  */
+  void record_pc(Dwarf_Addr pc) {
+    auto [slot, fresh] = histogram.try_emplace(pc, 1);
+    if (!fresh)
+      slot->second++;
+  }
+
+  /* Count one more traversal of the arc FROM -> TO; a first sighting
+     starts at 1.  */
+  void record_callgraph_arc(Dwarf_Addr from, Dwarf_Addr to) {
+    auto [slot, fresh] = callgraph.try_emplace(pair<uint64_t, uint64_t>(from, to), 1);
+    if (!fresh)
+      slot->second++;
+  }
+};
+
+// Container for all collected statistics: one UnwindDwflStats per pid
+// and one UnwindModuleStats per build-id string.
+struct UnwindStatsTable
+{
+  unordered_map<pid_t, UnwindDwflStats> dwfl_tab; /* keyed by pid */
+  unordered_map<string, UnwindModuleStats> buildid_tab; /* keyed by build-id string */
+  /* NOTE(review): this typedef names an ordered map although
+     buildid_tab above is unordered -- confirm which is intended.  */
+  typedef map<string, UnwindModuleStats> buildid_map_t;
+
+  UnwindStatsTable () {}
+  ~UnwindStatsTable () {}
+
+  /* Per-process lookups, keyed by pid (defined outside this declaration).  */
+  UnwindDwflStats *pid_find_or_create(pid_t pid);
+  string pid_find_comm(pid_t pid);
+  Dwfl *pid_find_dwfl(pid_t pid);
+  void pid_store_dwfl(pid_t pid, Dwfl *dwfl);
+
+  /* Per-module lookups, keyed by build-id string.  */
+  UnwindModuleStats *buildid_find(string buildid);
+  UnwindModuleStats *buildid_find_or_create(string buildid, Dwfl_Module *mod);
+
+  void print_summary() const;
+};
+
+class PerfConsumer;
+
+// A PerfReader creates perf_events file descriptors, monitors the
+// mmap'd ring buffers for events, and dispatches decoded forms to a
+// PerfConsumer.
+class PerfReader
+{
+private:
+  /* Sized by number of CPUs or threads: */
+  vector<int> perf_fds; // one perf_event fd per successfully opened cpu
+  vector<perf_event_mmap_page *> perf_headers; // mmap'd ring buffer (header page first) for each fd
+  vector<pollfd> pollfds; // POLLIN entry for each fd
+
+  PerfConsumer* consumer; // pluralize!
+  Ebl* default_ebl; // backend opened for the host machine; closed in the destructor
+  uint64_t sample_regs_user; // register mask obtained from ebl_perf_frame_regs_mask
+  int sample_regs_count; // number of bits set in sample_regs_user
+  bool enabled; // whether the events have been enabled yet (done lazily by process_some)
+  int page_size; // from getpagesize()
+  int page_count; // data pages per ring buffer
+  int mmap_size; // total bytes mapped per fd: header page plus page_count data pages
+  vector<uint8_t> event_wraparound_temp; // for events straddling ring buffer end
+
+  void decode_event(const perf_event_header* ehdr);
+
+public:
+  // PerfReader(perf_event_attr* attr, int pid, PerfConsumer* consumer); // attach to process hierarchy; may modify *attr
+  PerfReader(perf_event_attr* attr, PerfConsumer* consumer, int pid=-1); // pid > 0: attach to that process; otherwise systemwide; may modify *attr
+
+  ~PerfReader();
+
+  void process_some(); // run briefly, relay decoded perf_events to consumer
+  uint64_t regs_mask() { return this->sample_regs_user; }
+  Ebl *ebl() { return this->default_ebl; }
+};
+
+// A PerfConsumer receives both raw and decoded (fields split out into
+// function parameters) perf event records from a PerfReader.  Every
+// handler has an empty default implementation, so a subclass only
+// overrides the record types it cares about.
+class PerfConsumer
+{
+protected:
+  /* Access to sample_regs_user etc. metadata.  Null until a reader is
+     attached via the constructor or set_reader().  */
+  PerfReader *reader = nullptr;
+
+public:
+  PerfConsumer() = default;
+  PerfConsumer(PerfReader *reader) : reader(reader) {}
+  void set_reader(PerfReader *reader) { this->reader = reader; }
+
+  virtual ~PerfConsumer() {}
+
+  // Raw record handler.
+  virtual void process(const perf_event_header* sample) {}
+
+  // Decoded record handlers.
+  virtual void process_comm(const perf_event_header* sample,
+                           uint32_t pid, uint32_t tid, bool exec, const string& comm) {}
+  virtual void process_exit(const perf_event_header* sample,
+                           uint32_t pid, uint32_t ppid,
+                           uint32_t tid, uint32_t ptid) {}
+  virtual void process_fork(const perf_event_header* sample,
+                           uint32_t pid, uint32_t ppid,
+                           uint32_t tid, uint32_t ptid) {}
+  virtual void process_sample(const perf_event_header* sample,
+                             uint64_t ip,
+                             uint32_t pid, uint32_t tid,
+                             uint64_t time,
+                             uint64_t abi,
+                             uint32_t nregs, const uint64_t *regs,
+                             uint64_t data_size, const uint8_t *data) {}
+  virtual void process_mmap2(const perf_event_header* sample,
+                            uint32_t pid, uint32_t tid,
+                            uint64_t addr, uint64_t len, uint64_t pgoff,
+                            uint8_t build_id_size, const uint8_t *build_id,
+                            const char *filename) {}
+};
+
+// A StatsPerfConsumer is a toy concrete object that accepts decoded
+// perf events and logs and records basic stats about them.
+// Concrete PerfConsumer declaring a handler for every record type.
+// Handlers are marked `override` so a signature drifting away from
+// PerfConsumer becomes a compile error instead of a silently unused
+// method.
+class StatsPerfConsumer: public PerfConsumer
+{
+  unordered_map<int,unsigned> event_type_counts; // presumably keyed by perf event type -- confirm against the definitions
+
+public:
+  StatsPerfConsumer() {}
+  ~StatsPerfConsumer() override; // report to stdout
+  void process_comm(const perf_event_header* sample,
+                   uint32_t pid, uint32_t tid, bool exec, const string& comm) override;
+  void process_exit(const perf_event_header* sample,
+                   uint32_t pid, uint32_t ppid,
+                   uint32_t tid, uint32_t ptid) override;
+  void process_fork(const perf_event_header* sample,
+                   uint32_t pid, uint32_t ppid,
+                   uint32_t tid, uint32_t ptid) override;
+  void process_sample(const perf_event_header* sample,
+                     uint64_t ip,
+                     uint32_t pid, uint32_t tid,
+                     uint64_t time,
+                     uint64_t abi,
+                     uint32_t nregs, const uint64_t *regs,
+                     uint64_t data_size, const uint8_t *data) override;
+  void process_mmap2(const perf_event_header* sample,
+                    uint32_t pid, uint32_t tid,
+                    uint64_t addr, uint64_t len, uint64_t pgoff,
+                    uint8_t build_id_size, const uint8_t *build_id,
+                    const char *filename) override;
+  void process(const perf_event_header* sample) override;
+};
+
+// An UnwindSample records an unwound call stack from a perf-event
+// sample.
+struct UnwindSample
+{
+  const perf_event_header *event; /* perf event record this sample was derived from */
+  Dwfl *dwfl; /* Dwfl associated with the sample */
+  uint32_t pid, tid;
+  vector<Dwarf_Addr> addrs; /* unwound addresses -- presumably innermost frame first; confirm */
+  int elfclass; /* presumably ELFCLASS32 / ELFCLASS64 of the sampled process; confirm */
+
+  Dwarf_Addr base; /* for diagnostic purposes */
+  Dwarf_Addr sp; /* for diagnostic purposes */
+};
+
+class UnwindSampleConsumer;
+
+// A PerfConsumerUnwinder accepts decoded perf events, and produces
+// UnwindSample objects from them for relaying to an
+// UnwindSampleConsumer.
+// PerfConsumer that turns decoded perf events into UnwindSample
+// objects for an UnwindSampleConsumer.  Overriding handlers are marked
+// `override` so any signature drift from PerfConsumer is caught at
+// compile time.
+class PerfConsumerUnwinder: public PerfConsumer
+{
+  UnwindSampleConsumer *consumer; // downstream receiver of unwound samples
+  UnwindSample last_us; // XXX: why & is this safe to hang onto?
+  Dwflst_Process_Tracker *tracker;
+  UnwindStatsTable *stats;
+  unsigned maxframes;
+
+  int find_procfile(Dwfl *dwfl, pid_t *pid, Elf **elf, int *elf_fd);
+  Dwfl *find_dwfl(pid_t pid, const uint64_t *regs, uint32_t nregs,
+                 Elf **elf, bool *cached);
+
+  int get_sp_reg(bool is_abi32);
+
+public:
+  PerfConsumerUnwinder(UnwindSampleConsumer* usc, UnwindStatsTable *ust);
+  PerfConsumerUnwinder(UnwindSampleConsumer* usc, UnwindStatsTable *ust, PerfReader *reader);
+  ~PerfConsumerUnwinder() override;
+
+  /* libdwfl{st} callbacks */
+  Dwfl *init_dwfl(pid_t pid);
+  int unwind_frame_cb(Dwfl_Frame *state);
+
+  void process_comm(const perf_event_header* sample,
+                   uint32_t pid, uint32_t tid, bool exec, const string& comm) override;
+  void process_exit(const perf_event_header* sample,
+                   uint32_t pid, uint32_t ppid,
+                   uint32_t tid, uint32_t ptid) override;
+  void process_fork(const perf_event_header* sample,
+                   uint32_t pid, uint32_t ppid,
+                   uint32_t tid, uint32_t ptid) override;
+  void process_sample(const perf_event_header* sample,
+                     uint64_t ip,
+                     uint32_t pid, uint32_t tid,
+                     uint64_t time,
+                     uint64_t abi,
+                     uint32_t nregs, const uint64_t *regs,
+                     uint64_t data_size, const uint8_t *data) override;
+  void process_mmap2(const perf_event_header* sample,
+                    uint32_t pid, uint32_t tid,
+                    uint64_t addr, uint64_t len, uint64_t pgoff,
+                    uint8_t build_id_size, const uint8_t *build_id,
+                    const char *filename) override;
+};
+
+// An UnwindSampleConsumer receives an UnwindSample from a
+// PerfConsumerUnwinder.  Pure abstract: both process() and maxframes()
+// must be provided by a subclass.
+class UnwindSampleConsumer
+{
+public:
+  UnwindSampleConsumer() {}
+  virtual ~UnwindSampleConsumer() {}
+  virtual void process(const UnwindSample* sample) = 0; // handle one unwound sample
+  virtual int maxframes() = 0; // presumably the unwind depth this consumer wants -- confirm against the implementations
+};
+
+
+// An UnwindStatsConsumer is a toy that just collects statistics about
+// a received stream of UnwindSamples.
+// Concrete UnwindSampleConsumer holding a (non-owned) pointer to the
+// shared statistics table.  Overrides are marked `override` so a
+// signature mismatch with the base class is caught at compile time.
+class UnwindStatsConsumer: public UnwindSampleConsumer
+{
+  UnwindStatsTable *stats;
+
+public:
+  UnwindStatsConsumer(UnwindStatsTable *usc) : stats(usc) {}
+  ~UnwindStatsConsumer() override;
+  void process(const UnwindSample* sample) override;
+  int maxframes() override;
+};
+
+
+// A GprofUnwindSampleConsumer instance consumes UnwindSamples and tabulates
+// them by buildid, for eventual writing out into gmon.out format files.
+// Overrides are marked `override` so a signature mismatch with
+// UnwindSampleConsumer is caught at compile time.
+class GprofUnwindSampleConsumer: public UnwindSampleConsumer
+{
+  UnwindStatsTable *stats;
+  unordered_map<string, string> buildid_to_mainfile;
+  unordered_map<string, string> buildid_to_debugfile;
+  void record_gmon_hist(ostream &of, map<uint64_t, uint32_t> &histogram, uint64_t low_pc, uint64_t high_pc, uint64_t alignment);
+
+public:
+  GprofUnwindSampleConsumer(UnwindStatsTable *usc) : stats(usc) {}
+  ~GprofUnwindSampleConsumer() override; // write out all the gmon.$BUILDID.out files
+  void record_gmon_out(const string& buildid, UnwindModuleStats& m); // write out one gmon.$BUILDID.out file
+  void process(const UnwindSample* sample) override; // accumulate hits / callgraph edges (need maxdepth=1 only)
+  int maxframes() override;
+};
+
+// hypothetical: FlamegraphUnwindSampleConsumer, taking in a bigger maxdepth
+// hypothetical: PprofUnwindSampleConsumer, https://github.com/google/pprof
+
+
+////////////////////////////////////////////////////////////////////////
+// command line parsing and main()
+
+/* Name and version of program.  */
+ARGP_PROGRAM_VERSION_HOOK_DEF = print_version;
+
+/* Bug report address.  */
+ARGP_PROGRAM_BUG_ADDRESS_DEF = PACKAGE_BUGREPORT;
+
+#define HIST_SPLIT_OPTS "none/even/flex"
+
+/* Definitions of arguments for argp functions.  */
+/* Option table consumed by argp; each key is handled in parse_opt.  */
+static const struct argp_option options[] =
+{
+  { NULL, 0, NULL, OPTION_DOC, N_("Output options:"), 1 },
+  { "verbose", 'v', NULL, 0, N_ ("Increase verbosity of logging messages (modules/samples/frames/more)."), 0 },
+  /* TODO: Add "quiet" option suppressing summary table. */
+  { "gmon", 'g', NULL, 0, N_("Generate gmon.BUILDID.out files for each binary."), 0 },
+  { "hist-split",'G', HIST_SPLIT_OPTS, 0, N_("Histogram splitting method for gmon, default 'even'."), 0 },
+  { "maxframes", 'n', "MAXFRAMES", 0, N_("Maximum number of frames to unwind, default 1 with --gmon, 256 otherwise."), 0 }, /* TODO */
+  { "output", 'o', "DIR", 0, N_("Output directory for gmon files."), 0 },
+  { "force", 'f', NULL, 0, N_("Unlink output files to force writing as new."), 0 },
+  { "pid", 'p', "PID", 0, N_("Profile given PID, and its future children."), 0 },
+#ifdef HAVE_PERFMON_PFMLIB_PERF_EVENT_H
+  { "event", 'e', "EVENT", 0, N_("Sample given LIBPFM event specification."), 0 },
+#define ARGP_KEY_EVENT_LIST 0x1000
+  /* Takes no argument: prints the available events and exits.  */
+  { "event-list", ARGP_KEY_EVENT_LIST, NULL, 0, N_("List available LIBPFM events and exit."), 0 },
+#endif
+  { NULL, 0, NULL, 0, NULL, 0 }
+};
+
+static error_t parse_opt (int key, char *arg, struct argp_state *state);
+static const struct argp argp =
+  {
+    options, parse_opt, "[--] [CMD]...", N_("Collect systemwide stack-trace 
profiles."),
+    NULL, NULL, NULL
+  };
+
+// How to divide the program counter histograms in gmon output:
+enum hist_split_method {
+  HIST_SPLIT_NONE = 0, /* one histogram for the entire executable */
+  HIST_SPLIT_EVEN = 1, /* all histograms the same size */
+  HIST_SPLIT_FLEX = 2, /* variable-size histograms */
+};
+
+// Globals set based on command line options:
+static unsigned verbose; // number of -v flags seen
+static bool gmon; // set by -g, and implicitly by -G and -o
+static hist_split_method gmon_hist_split = HIST_SPLIT_EVEN;
+static string output_dir = ".";
+static bool output_force = false; // overwrite preexisting output files?
+static int pid; // target from -p, or the forked CMD child; 0 means systemwide
+static int opt_maxframes = -1; // set to >= 0 to override default maxframes in consumer
+static string libpfm_event; // -e argument as given
+static string libpfm_event_decoded; // expanded form reported by libpfm
+static perf_event_attr attr; // filled in by main and adjusted further by PerfReader
+static bool branch_record = false; // using accurate branch recording for call-graph arcs rather than backtrace heuristics
+
+// Verbosity categories (each enabled by one more -v; see main):
+static bool show_summary = true; /* XXX could suppress with --quiet */
+static bool show_modules = false; /* -> first sample for each module */
+static bool show_samples = false; /* -> every sample */
+static bool show_frames = false;
+static bool show_debugfile = false;
+static bool show_tmi = false; /* -> perf, cfi details */
+
+/* argp option callback: record each command line option in the
+   matching global.  Returns 0 when KEY was handled and
+   ARGP_ERR_UNKNOWN for keys this parser does not recognize.  Invalid
+   option arguments are reported through argp_error.  --event-list
+   prints the available libpfm events and exits the process.  */
+static error_t
+parse_opt (int key, char *arg, struct argp_state *state)
+{
+  switch (key)
+    {
+    case ARGP_KEY_INIT:
+      break;
+
+    case 'v':
+      verbose ++;
+      break;
+
+    case 'g':
+      gmon = true;
+      break;
+
+    case 'G':
+      {
+        gmon = true; /* Automatically enable gmon mode if they set a gmon option. */
+        std::string_view method (arg);
+        if (method == "none")
+          gmon_hist_split = HIST_SPLIT_NONE;
+        else if (method == "even")
+          gmon_hist_split = HIST_SPLIT_EVEN;
+        else if (method == "flex")
+          gmon_hist_split = HIST_SPLIT_FLEX;
+        else
+          {
+            /* Reject typos instead of silently keeping the default.  */
+            argp_error (state, N_("-G METHOD should be one of %s."), HIST_SPLIT_OPTS);
+            return EINVAL;
+          }
+      }
+      break;
+
+    case 'o':
+      gmon = true;
+      output_dir = arg;
+      break;
+
+    case 'p':
+      pid = atoi(arg);
+      break;
+
+    case 'n':
+      opt_maxframes = atoi(arg);
+      if (opt_maxframes < 0)
+        {
+          argp_error (state, N_("-n MAXFRAMES should be 0 or higher."));
+          return EINVAL;
+        }
+      break;
+
+    case 'f':
+      output_force = true;
+      break;
+
+#ifdef HAVE_PERFMON_PFMLIB_PERF_EVENT_H
+    case 'e':
+      libpfm_event = arg;
+      break;
+
+    case ARGP_KEY_EVENT_LIST:
+      {
+        pfm_pmu_info_t pinfo;
+        pfm_event_info_t info;
+
+        pfm_err_t rc = pfm_initialize();
+        if (rc != PFM_SUCCESS)
+          {
+            cerr << format("ERROR: pfm_initialize failed: {}\n", pfm_strerror(rc));
+            exit(1);
+          }
+
+        memset(&pinfo, 0, sizeof(pinfo));
+        memset(&info, 0, sizeof(info));
+        pinfo.size = sizeof(pinfo);
+        info.size = sizeof(info);
+
+        /* Walk every PMU that is present and print its events as
+           PMU::EVENT.  */
+        for(int j= PFM_PMU_NONE ; j< PFM_PMU_MAX; j++)
+          {
+            pfm_err_t ret = pfm_get_pmu_info((pfm_pmu_t) j, &pinfo);
+            if (ret != PFM_SUCCESS)
+              continue;
+            if (! pinfo.is_present)
+              continue;
+            for (int i = pinfo.first_event; i != -1; i = pfm_get_event_next(i))
+              {
+                ret = pfm_get_event_info(i, PFM_OS_PERF_EVENT_EXT, &info);
+                if (ret == PFM_SUCCESS)
+                  clog << format("{}::{}\n", pinfo.name, info.name);
+              }
+          }
+      }
+      exit(0);
+#endif
+
+    default:
+      return ARGP_ERR_UNKNOWN;
+    }
+  return 0;
+}
+
+/* Count of SIGINT/SIGTERM signals received so far; polled by the main
+   loop.  It is shared with a signal handler, so it must be volatile
+   sig_atomic_t for the polling read to be well-defined.  */
+volatile sig_atomic_t interrupted;
+
+/* Signal handler: the first signal only raises the flag so the main
+   loop can stop; a second one terminates at once with _exit, without
+   running destructors.  */
+void sigint_handler(int sig __attribute__((unused)))
+{
+  /* Not "interrupted ++": increment of a volatile object is deprecated
+     in C++20.  */
+  interrupted = interrupted + 1;
+  if (interrupted > 1)
+    _exit(1);
+}
+
+/* Program entry point: parse options, configure the perf event,
+   optionally fork CMD (held back on a pipe until profiling is set up),
+   build the reader -> unwinder -> consumer pipeline and run it until
+   interrupted or until the target process is gone.  Reports are
+   produced by the destructors of the pipeline objects.  Returns 0 on
+   success and 1 if an exception aborted the run.  */
+int
+main (int argc, char *argv[])
+{
+  int remaining;
+  int pipefd[2] = {-1, -1}; // for CMD child process post-fork sync
+  bool has_cmd = false;
+  (void) argp_parse (&argp, argc, argv, 0, &remaining, NULL);
+
+  /* show_summary is true by default */
+  if (verbose > 0) show_modules = true;
+  if (verbose > 1) show_samples = true;
+  if (verbose > 2) show_frames = true;
+  if (verbose > 3) show_debugfile = true;
+  if (verbose > 4) show_tmi = true;
+
+  if (pid > 0 && remaining < argc) // got a pid AND a cmd? reject
+    {
+      cerr << format("ERROR: Must not specify both -p PID and CMD\n");
+      exit(1);
+    }
+
+  bool systemwide = (pid == 0) || (remaining == argc);
+  (void) systemwide;
+
+  try
+    {
+      memset(&attr, 0, sizeof(attr));
+      attr.size = sizeof(attr);
+
+      if (libpfm_event != "")
+        {
+#ifdef HAVE_PERFMON_PFMLIB_PERF_EVENT_H
+          pfm_err_t rc = pfm_initialize();
+          if (rc != PFM_SUCCESS)
+            {
+              cerr << format("ERROR: pfm_initialize failed: {}\n", pfm_strerror(rc));
+              exit(1);
+            }
+          char* fstr = nullptr;
+          pfm_perf_encode_arg_t arg = { .attr = &attr, .fstr=&fstr, .size = sizeof(arg) };
+          rc = pfm_get_os_event_encoding(libpfm_event.c_str(),
+                                         PFM_PLM3, /* userspace, whether systemwide or not */
+                                         PFM_OS_PERF_EVENT_EXT, &arg);
+          if (rc != PFM_SUCCESS)
+            {
+              cerr << format("ERROR: pfm_get_os_event_encoding failed: {}\n", pfm_strerror(rc));
+              exit(1);
+            }
+          if (verbose)
+            {
+              clog << format("libpfm expanded {} to {}\n", libpfm_event, fstr);
+            }
+          libpfm_event_decoded = fstr; // overwrite
+          free(fstr);
+#endif
+        }
+      else
+        {
+          // same as: -e perf::CPU-CLOCK:freq=1000
+          attr.type = PERF_TYPE_SOFTWARE;
+          attr.config = PERF_COUNT_SW_CPU_CLOCK;
+          attr.sample_freq = 1000;
+          attr.freq = 1;
+          attr.exclude_kernel = 1;
+          attr.exclude_hv = 1;
+          attr.exclude_guest = 1;
+        }
+
+      if (show_summary)
+        {
+          clog << format("perf_event_attr configuration type={:x} config={:x} {}{}\n",
+                         attr.type, attr.config,
+                         (attr.freq ? "sample_freq=" : "sample_period="),
+                         (attr.freq ? attr.sample_freq : attr.sample_period));
+          clog << endl;
+        }
+
+      if (remaining < argc) // got a CMD... suffix?  ok start it
+        {
+          has_cmd = true;
+          int rc = pipe (pipefd); // will use pipefd[] >= 0 as flag for synchronization just below
+          if (rc < 0)
+            {
+              cerr << format("ERROR: pipe failed: {}\n", strerror(errno));
+              exit(1);
+            }
+
+          pid = fork();
+          if (pid == 0) // in child
+            {
+              close (pipefd[1]); // close write end
+              char dummy;
+              int rc = read (pipefd[0], &dummy, 1); // block until parent is ready
+              if (rc != 1)
+                {
+                  cerr << format("ERROR: child sync read failed: {}\n", strerror(errno));
+                  exit(1);
+                }
+              close (pipefd[0]);
+              execvp (argv[remaining], & argv[remaining] /* not +1: child argv[0] included! */ );
+              // notreached unless error
+              cerr << format("ERROR: execvp failed: {}\n", strerror(errno));
+              exit(1);
+            }
+          else if (pid > 0) // in parent
+            {
+              close (pipefd[0]); // close read end
+              // will write to pipefd[1] after perfreader sicced at child
+            }
+          else // error
+            {
+              cerr << format("ERROR: fork failed: {}\n", strerror(errno));
+              exit(1);
+            }
+        }
+
+      // Create the perf processing pipeline as per command line options
+      PerfReader *pr = nullptr;
+      UnwindStatsTable *tab = nullptr;
+      UnwindSampleConsumer *usc = nullptr;
+      PerfConsumerUnwinder *pcu = nullptr;
+      StatsPerfConsumer *spc = nullptr;
+
+      if (gmon)
+        {
+          tab = new UnwindStatsTable();
+          usc = new GprofUnwindSampleConsumer(tab);
+          pcu = new PerfConsumerUnwinder(usc, tab);
+          pr = new PerfReader(&attr, pcu, pid);
+        }
+      else
+        {
+          tab = new UnwindStatsTable();
+          usc = new UnwindStatsConsumer(tab);
+          pcu = new PerfConsumerUnwinder (usc, tab);
+          pr = new PerfReader(&attr, pcu, pid);
+#if 0
+          spc = new StatsPerfConsumer();
+          pr = new PerfReader(&attr, spc, pid);
+#endif
+        }
+
+      signal(SIGINT, sigint_handler);
+      signal(SIGTERM, sigint_handler);
+
+      if (pid > 0 && has_cmd) // need to release child CMD process?
+        {
+          // Checked explicitly rather than with assert(), which is
+          // compiled out under NDEBUG.
+          ssize_t rc = write(pipefd[1], "x", 1); // unblock child
+          if (rc != 1)
+            {
+              cerr << format("ERROR: child sync write failed: {}\n", strerror(errno));
+              exit(1);
+            }
+          close(pipefd[1]);
+        }
+
+      if (show_summary)
+        {
+          clog << "Starting stack profile collection ";
+          if (pid) clog << format("pid {}", pid);
+          else clog << "systemwide";
+          clog << "\n";
+        }
+
+      while (true) // main loop
+        {
+          if (interrupted) break;
+          if (pid > 0) waitpid(pid, NULL, WNOHANG); // reap dead child to allow kill(pid, 0) to signal death
+          if (pid > 0 && kill(pid, 0) != 0) break; // exit if child or targeted non-child process died
+          pr->process_some();
+        }
+
+      delete pr;
+      delete usc;
+      delete pcu;
+      delete spc;
+      delete tab;
+
+      // reporting done in various destructors
+    }
+  catch (const exception& e)
+    {
+      cerr << format("{}\n", e.what());
+      return 1; // a failed run must not look like success to the caller
+    }
+
+  return 0;
+}
+
+
+////////////////////////////////////////////////////////////////////////
+// perf reader
+
+/* Set up sampling on every configured cpu.  Completes *ATTR with the
+   sample layout needed for unwinding (user registers plus a user stack
+   snapshot) and with mmap/comm/task record requests, opens one
+   perf_event fd per cpu for PID (or for all processes when PID <= 0)
+   and maps a ring buffer for each.  The events are created disabled.
+   CONSUMER is told about this reader via set_reader().  Throws
+   runtime_error if no ebl backend is available or no event could be
+   opened; exits on an unsupported architecture.  */
+PerfReader::PerfReader(perf_event_attr* attr, PerfConsumer* consumer, int pid)
+{
+  this->page_size = getpagesize();
+  this->page_count = 64; /* XXX May want to verify if this is a large-enough power-of-2.  */
+  this->mmap_size = this->page_size * (this->page_count + 1); // total mmap size, incl header page
+  this->event_wraparound_temp.resize(this->mmap_size); // NB: never resize this object again!
+  this->consumer = consumer;
+  this->consumer->set_reader(this);
+  this->enabled = false;
+
+  struct utsname u;
+  uname(&u);
+  int em = EM_NONE;
+  std::string_view machine = u.machine;
+  if (machine == "x86_64") em = EM_X86_64;
+  else if (machine == "i686" || machine == "i386") em = EM_386;
+  else if (machine == "aarch64") em = EM_AARCH64; // 64-bit ARM has its own ELF machine
+  else if (machine == "armv7l") em = EM_ARM;
+  else {
+    cerr << format("ERROR: Unsupported architecture: {}\n", u.machine);
+    exit(1);
+  }
+  this->default_ebl = ebl_openbackend_machine(em);
+  if (this->default_ebl == NULL) // don't hand a null backend to the calls below
+    throw runtime_error(format("ERROR: no ebl backend for architecture: {}", u.machine));
+  this->sample_regs_user = ebl_perf_frame_regs_mask (this->default_ebl);
+  this->sample_regs_count = bitset<64>(this->sample_regs_user).count();
+
+  attr->sample_regs_user = this->sample_regs_user;
+  attr->sample_stack_user = 8192; // enough?
+  attr->sample_type = (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_TIME);
+  attr->sample_type |= PERF_SAMPLE_REGS_USER;
+  attr->sample_type |= PERF_SAMPLE_STACK_USER;
+  // XXX Maybe: ask for PERF_SAMPLE_CALLCHAIN, in case kernel can
+  // unwind for us?  Would want an option to control this, to allow
+  // eu-stackprof to exercise our own unwinding functionality when
+  // testing.
+  attr->mmap = 1;
+  attr->mmap2 = 1;
+  attr->exclude_kernel = 1; /* in-kernel unwinding not relevant for our usecase */
+  attr->disabled = 1; /* will get enabled soon */
+  attr->task = 1; // catch FORK/EXIT
+  attr->comm = 1; // catch EXEC
+  attr->comm_exec = 1; // catch EXEC
+  // attr->precise_ip = 2; // request 0 skid ... but that conflicts with PERF_COUNT_HW_BRANCH_INSTRUCTIONS:freq=4000
+  attr->build_id = 1; // request build ids in MMAP2 events
+
+  if (pid > 0) // attached to one process: follow its children too
+    attr->inherit = 1; // propagate to child processes
+
+
+  if (show_tmi)
+    { // hexdump attr
+      clog << "perf_event_attr hexdump: ";
+      auto bytes = (unsigned char*) attr;
+      for (size_t x = 0; x<sizeof(*attr); x++)
+        clog << ((x % 8) ? "" : " ")
+             << ((x % 32) ? "" : "\n")
+             << format("{:02x}", (unsigned)bytes[x]);
+      clog << "\n";
+    }
+
+  // Iterate over all cpus, even if attaching to a single pid, because
+  // we set ->inherit=1.  That requires possible concurrency, which is
+  // enabled by per-cpu ring buffers.
+  int ncpus = sysconf(_SC_NPROCESSORS_CONF);
+  for (int cpu=0; cpu<ncpus; cpu++)
+    {
+      int fd = syscall(__NR_perf_event_open, attr,
+                       (pid > 0 ? pid : -1), cpu, -1,
+                       PERF_FLAG_FD_CLOEXEC);
+      if (fd < 0)
+        {
+          cerr << format("WARNING: unable to open perf event for cpu {}: {}\n", cpu, strerror(errno));
+          continue;
+        }
+      void *buf = mmap(NULL, this->mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+      if (buf == MAP_FAILED)
+        {
+          cerr << format("ERROR: perf event mmap failed: {}\n", strerror(errno));
+          close(fd);
+          continue;
+        }
+      // Only cpus with both an fd and a mapping are recorded, so the
+      // three vectors stay index-aligned.
+      this->perf_fds.push_back(fd);
+      this->perf_headers.push_back((perf_event_mmap_page*) buf);
+      struct pollfd pfd = {.fd = fd, .events=POLLIN};
+      this->pollfds.push_back(pfd);
+    }
+
+  if (this->perf_fds.size() == 0)
+    throw runtime_error("ERROR: no perf events opened");
+}
+
+// Release the perf event descriptors, the mmap'd regions recorded in
+// perf_headers, and the default ebl backend.
+PerfReader::~PerfReader()
+{
+  for (size_t i = 0; i < this->perf_fds.size(); i++)
+    close(this->perf_fds[i]);
+
+  for (size_t i = 0; i < this->perf_headers.size(); i++)
+    {
+      void *region = (void*) this->perf_headers[i];
+      munmap(region, this->mmap_size);
+    }
+
+  ebl_closebackend (this->default_ebl);
+}
+
+// Current steady_clock reading, expressed in whole milliseconds.
+uint64_t millis_monotonic()
+{
+  auto since_epoch = chrono::steady_clock::now().time_since_epoch();
+  auto as_millis = chrono::duration_cast<chrono::milliseconds>(since_epoch);
+  return as_millis.count();
+}
+
+// Read the producer position (data_head) of a perf ring buffer.
+//
+// The load has acquire semantics so that record bytes read afterwards
+// are not observed before the head value that announces them.  An
+// empty asm with a "memory" clobber only restrains the compiler; it
+// emits no barrier instruction, which is not enough on weakly-ordered
+// CPUs.
+static inline uint64_t
+ring_buffer_read_head(volatile struct perf_event_mmap_page *base)
+{
+  return __atomic_load_n(&base->data_head, __ATOMIC_ACQUIRE);
+}
+
+// Publish the consumer position (data_tail) of a perf ring buffer.
+//
+// The store has release semantics so that every read of record bytes
+// made before this call completes before the new tail becomes visible.
+// A compiler-only barrier does not guarantee that on weakly-ordered
+// CPUs.
+static inline void
+ring_buffer_write_tail(volatile struct perf_event_mmap_page *base,
+                       uint64_t tail)
+{
+  __atomic_store_n(&base->data_tail, tail, __ATOMIC_RELEASE);
+}
+
+// Enable the perf events on the first call, then spend up to one
+// second polling the ring buffers and handing each record to
+// decode_event().  Stops early once `interrupted' is set or poll()
+// reports an error.
+void PerfReader::process_some()
+{
+  if (! this->enabled)
+    {
+      for (auto fd : this->perf_fds)
+        ioctl(fd, PERF_EVENT_IOC_ENABLE, 0 /* value ignored */);
+      this->enabled = true;
+    }
+
+  uint64_t starttime = millis_monotonic();
+  uint64_t endtime = starttime + 1000; // run at most one second
+  uint64_t ring_buffer_size = this->page_size * this->page_count; // just the ring buffer size
+
+  while (! interrupted)
+    {
+      uint64_t now = millis_monotonic();
+      if (endtime < now)
+        break;
+      int ready = poll(this->pollfds.data(), this->pollfds.size(), (int)(endtime-now)); // wait a little while
+      if (ready < 0)
+        break;
+
+      for (size_t i = 0; i < pollfds.size(); i++)
+        if (this->pollfds[i].revents & POLLIN) // found an fd with fresh yummy events
+          {
+            perf_event_mmap_page *header = perf_headers[i];
+            uint64_t data_head = ring_buffer_read_head(header);
+            uint64_t data_tail = header->data_tail;
+            uint8_t *base = ((uint8_t *) header) + this->page_size; // data area follows the header page
+            struct perf_event_header *ehdr;
+            size_t ehdr_size;
+
+            while (data_head != data_tail) // consume all packets in ring buffer XXX why?
+              {
+                ehdr = (perf_event_header*) (base + (data_tail & (ring_buffer_size - 1)));
+                ehdr_size = ehdr->size;
+                if (show_tmi)
+                  clog << format("perf head={:p} tail={:p} ehdr={:p} size={:d}{:x}\n",
+                                 (void*) data_head, (void*) data_tail, (void*) ehdr, ehdr_size, 0);
+
+                // A record shorter than its own header (size 0 would
+                // never advance data_tail) or longer than the unread
+                // bytes cannot be decoded; skip what is left instead
+                // of spinning or reading past the available data.
+                if (ehdr_size < sizeof(perf_event_header)
+                    || ehdr_size > data_head - data_tail)
+                  {
+                    cerr << format("WARNING: malformed perf record (size {}), skipping rest of ring buffer\n",
+                                   ehdr_size);
+                    data_tail = data_head;
+                    break;
+                  }
+
+                if (((uint8_t *)ehdr) + ehdr_size > base + ring_buffer_size) // mmap region wraparound?
+                  {
+                    // need to copy it to a contiguous temporary
+                    uint8_t *copy_start = (uint8_t*) ehdr;
+                    size_t len_first = base + ring_buffer_size - copy_start;
+                    size_t len_secnd = ehdr_size - len_first;
+                    uint8_t *event_temp = this->event_wraparound_temp.data();
+                    memcpy(event_temp, copy_start, len_first);       // part at end of mmap'd region
+                    memcpy(event_temp + len_first, base, len_secnd); // part at beginning of mmap'd region
+                    ehdr = (perf_event_header*) event_temp;
+                  }
+
+                this->decode_event(ehdr);
+                data_tail += ehdr_size;
+              }
+
+            // hand the consumed space back to the producer
+            ring_buffer_write_tail(header, data_tail);
+          }
+    }
+}
+
+// Copy one fixed-size field out of a perf record and advance CURSOR
+// past it.
+template <typename T>
+static inline T
+perf_record_take(const uint8_t *&cursor)
+{
+  T value;
+  memcpy(&value, cursor, sizeof(T));
+  cursor += sizeof(T);
+  return value;
+}
+
+// Pass EHDR to consumer->process(), then split the record types this
+// tool understands into their fields and call the matching
+// consumer->process_*() handler.  Other record types are ignored.
+void PerfReader::decode_event(const perf_event_header* ehdr)
+{
+  consumer->process(ehdr); // allow general processing
+
+  // every record body starts right after the common header
+  const uint8_t* data = reinterpret_cast<const uint8_t*>(ehdr) + sizeof(perf_event_header);
+
+  // and decode into individual event types
+  switch (ehdr->type)
+    {
+    case PERF_RECORD_SAMPLE:
+      {
+        uint64_t ip = perf_record_take<uint64_t>(data);
+        uint32_t pid = perf_record_take<uint32_t>(data);
+        uint32_t tid = perf_record_take<uint32_t>(data);
+        uint64_t time = perf_record_take<uint64_t>(data);
+        // PERF_SAMPLE_CALLCHAIN would be here if requested
+        uint64_t abi = perf_record_take<uint64_t>(data);
+        // With ABI_NONE no register values follow the abi word, so
+        // skipping sample_regs_count slots would misplace the stack data.
+        uint32_t nregs = (abi == PERF_SAMPLE_REGS_ABI_NONE) ? 0 : this->sample_regs_count;
+        const uint64_t* regs = reinterpret_cast<const uint64_t*>(data);
+        data += nregs * sizeof(uint64_t);
+        uint64_t data_size = perf_record_take<uint64_t>(data);
+        const uint8_t* stack_data = data;
+        consumer->process_sample(ehdr, ip, pid, tid, time, abi, nregs, regs, data_size, stack_data);
+        break;
+      }
+    case PERF_RECORD_COMM:
+      {
+        uint32_t pid = perf_record_take<uint32_t>(data);
+        uint32_t tid = perf_record_take<uint32_t>(data);
+        const char* comm = reinterpret_cast<const char*>(data);
+        consumer->process_comm(ehdr, pid, tid, (ehdr->misc & PERF_RECORD_MISC_COMM_EXEC), comm);
+        break;
+      }
+    case PERF_RECORD_EXIT:
+      {
+        uint32_t pid = perf_record_take<uint32_t>(data);
+        uint32_t ppid = perf_record_take<uint32_t>(data);
+        uint32_t tid = perf_record_take<uint32_t>(data);
+        uint32_t ptid = perf_record_take<uint32_t>(data);
+        consumer->process_exit(ehdr, pid, ppid, tid, ptid);
+        break;
+      }
+    case PERF_RECORD_FORK:
+      {
+        uint32_t pid = perf_record_take<uint32_t>(data);
+        uint32_t ppid = perf_record_take<uint32_t>(data);
+        uint32_t tid = perf_record_take<uint32_t>(data);
+        uint32_t ptid = perf_record_take<uint32_t>(data);
+        consumer->process_fork(ehdr, pid, ppid, tid, ptid);
+        break;
+      }
+    case PERF_RECORD_MMAP2:
+      {
+        uint32_t pid = perf_record_take<uint32_t>(data);
+        uint32_t tid = perf_record_take<uint32_t>(data);
+        uint64_t addr = perf_record_take<uint64_t>(data);
+        uint64_t len = perf_record_take<uint64_t>(data);
+        uint64_t pgoff = perf_record_take<uint64_t>(data);
+
+        // maj/min/ino/ino_generation share their 24 bytes with the
+        // build-id form (size byte, 3 bytes padding, 20-byte array).
+        // prot/flags/filename come after those 24 bytes in both forms,
+        // regardless of how many build-id bytes are in use.
+        const size_t id_area_size = 4 + 4 + 8 + 8;
+        const size_t build_id_offset = sizeof(uint8_t) + sizeof(uint8_t) + sizeof(uint16_t);
+        const uint8_t* id_area = data;
+        data += id_area_size;
+
+        uint8_t build_id_size = 0;
+        const uint8_t* build_id = nullptr;
+        if (ehdr->misc & PERF_RECORD_MISC_MMAP_BUILD_ID)
+          {
+            build_id_size = id_area[0];
+            if (build_id_size > id_area_size - build_id_offset)
+              build_id_size = id_area_size - build_id_offset; // never read past the array
+            build_id = id_area + build_id_offset;
+          }
+        data += sizeof(uint32_t) + sizeof(uint32_t); // prot, flags
+        const char* filename = reinterpret_cast<const char*>(data);
+        consumer->process_mmap2(ehdr, pid, tid, addr, len, pgoff, build_id_size, build_id, filename);
+        break;
+      }
+    default:
+      break;
+    }
+}
+
+
+////////////////////////////////////////////////////////////////////////
+// perf event consumers
+
+// Trace a COMM record to clog when show_modules is set.
+void StatsPerfConsumer::process_comm(const perf_event_header *sample,
+                                     uint32_t pid, uint32_t tid, bool exec, const string &comm)
+{
+  if (! show_modules)
+    return;
+  clog << format("process_comm: pid={} tid={} exec={} comm={}\n", pid, tid, exec, comm);
+}
+
+// Trace an EXIT record to clog when show_modules is set.
+void StatsPerfConsumer::process_exit(const perf_event_header *sample,
+                                     uint32_t pid, uint32_t ppid,
+                                     uint32_t tid, uint32_t ptid)
+{
+  if (! show_modules)
+    return;
+  clog << format("process_exit: pid={} ppid={} tid={} ptid={}\n", pid, ppid, tid, ptid);
+}
+
+// Trace a FORK record to clog when show_modules is set.
+void StatsPerfConsumer::process_fork(const perf_event_header *sample,
+                                     uint32_t pid, uint32_t ppid,
+                                     uint32_t tid, uint32_t ptid)
+{
+  if (! show_modules)
+    return;
+  clog << format("process_fork: pid={} ppid={} tid={} ptid={}\n", pid, ppid, tid, ptid);
+}
+
+// Trace the scalar fields of a SAMPLE record to clog when show_samples
+// is set; the register and stack payloads are not printed.
+void StatsPerfConsumer::process_sample(const perf_event_header *sample,
+                                       uint64_t ip,
+                                       uint32_t pid, uint32_t tid,
+                                       uint64_t time,
+                                       uint64_t abi,
+                                       uint32_t nregs, const uint64_t *regs,
+                                       uint64_t data_size, const uint8_t *data)
+{
+  if (! show_samples)
+    return;
+  clog << format("process_sample: pid={:d} tid={:d} ip={:x} time={:d} abi={:d} nregs={:d} data_size={:d}\n",
+                 pid, tid, ip, time, abi, nregs, data_size);
+}
+
+// Trace an MMAP2 record to clog when show_modules is set; the build-id
+// bytes themselves are not printed, only their count.
+void StatsPerfConsumer::process_mmap2(const perf_event_header *sample,
+                                      uint32_t pid, uint32_t tid,
+                                      uint64_t addr, uint64_t len, uint64_t pgoff,
+                                      uint8_t build_id_size, const uint8_t *build_id,
+                                      const char *filename)
+{
+  if (! show_modules)
+    return;
+  clog << format("process_mmap2: pid={:d} tid={:d} addr={:x} len={:x} pgoff={:x} build_id_size={:d} filename={:s}\n",
+                 pid, tid, addr, len, pgoff, (unsigned)build_id_size, filename);
+}
+
+// Dump the per-record-type counters gathered by process() to clog.
+StatsPerfConsumer::~StatsPerfConsumer()
+{
+  for (auto it = this->event_type_counts.begin();
+       it != this->event_type_counts.end();
+       ++it)
+    clog << format("event type {} count {}\n", it->first, it->second);
+}
+
+// Count one more record of this header's type.
+void StatsPerfConsumer::process(const perf_event_header* ehdr)
+{
+  auto &tally = this->event_type_counts[ehdr->type];
+  tally++;
+}
+
+
+//////////////////////////////////////////////////////////////////////
+// unwind stats table for PerfConsumerUnwinder + downstream consumers
+
+// Return the stats entry for PID, inserting a default-constructed one
+// the first time the pid is seen.
+UnwindDwflStats *UnwindStatsTable::pid_find_or_create (pid_t pid)
+{
+  auto it = this->dwfl_tab.find(pid);
+  if (it == this->dwfl_tab.end())
+    it = this->dwfl_tab.emplace(pid, UnwindDwflStats()).first;
+  return &it->second;
+}
+
+static const string unknown_comm = "<unknown>";
+
+// Return the command name recorded for PID.  The first lookup reads
+// the first line of /proc/PID/comm and stores it in the pid's entry
+// (or "<unknown>" when that fails); later lookups return the stored
+// value.
+string UnwindStatsTable::pid_find_comm (pid_t pid)
+{
+  UnwindDwflStats *stats_ent = this->pid_find_or_create(pid);
+  if (stats_ent == NULL)
+    return unknown_comm;
+
+  if (stats_ent->comm.empty())
+    {
+      ifstream comm_file(format("/proc/{}/comm", pid));
+      string first_line;
+      bool got_line = comm_file && getline(comm_file, first_line);
+      stats_ent->comm = got_line ? first_line : unknown_comm;
+    }
+
+  return stats_ent->comm;
+}
+
+// Return the Dwfl stored for PID, or NULL when the pid has no entry.
+// Never creates an entry.
+Dwfl *UnwindStatsTable::pid_find_dwfl (pid_t pid)
+{
+  auto it = this->dwfl_tab.find(pid);
+  if (it == this->dwfl_tab.end())
+    return NULL;
+  return it->second.dwfl;
+}
+
+// Remember DWFL as the Dwfl for PID, creating the pid's entry if
+// needed.  With show_summary set, the command name is looked up too.
+void UnwindStatsTable::pid_store_dwfl (pid_t pid, Dwfl *dwfl)
+{
+  UnwindDwflStats *slot = this->pid_find_or_create(pid);
+  if (slot != NULL)
+    {
+      slot->dwfl = dwfl;
+      if (show_summary)
+        this->pid_find_comm(pid); // result unused; the call fills in the entry's comm
+    }
+}
+
+// Return the module stats stored under BUILDID, or NULL when there are
+// none.  Never creates an entry.
+UnwindModuleStats *UnwindStatsTable::buildid_find (string buildid)
+{
+  auto it = this->buildid_tab.find(buildid);
+  if (it == this->buildid_tab.end())
+    return NULL;
+  return &it->second;
+}
+
+// Return the module stats stored under BUILDID, inserting a
+// default-constructed entry first when there is none.  MOD is
+// currently unused.
+UnwindModuleStats *UnwindStatsTable::buildid_find_or_create (string buildid, Dwfl_Module *mod)
+{
+  auto it = this->buildid_tab.find(buildid);
+  if (it == this->buildid_tab.end())
+    {
+      it = this->buildid_tab.emplace(buildid, UnwindModuleStats()).first;
+      /* TODO: Guess text range for mod? */
+      (void)mod;
+    }
+  return &it->second;
+}
+
+// Print one line of sample statistics per tracked pid to clog,
+// followed by a TOTAL line.
+void UnwindStatsTable::print_summary () const
+{
+  // x as a percentage of tot; 0.0 when both are zero
+  auto percent = [](auto x, auto tot) -> double
+    {
+      return (x+tot == 0) ? 0.0 : ((double)x)/((double)tot)*100.0;
+    };
+
+  int sum_samples = 0;
+  int sum_lost = 0;
+  clog << "\n=== pid / sample counts ===\n";
+  for (auto& row : this->dwfl_tab)
+    {
+      pid_t pid = row.first;
+      const UnwindDwflStats& d = row.second;
+      clog << format(N_("{} {} -- max {} frames, received {} samples, lost {} samples ({:.1f}%) (last {}, worst {})\n"),
+                     pid, d.comm, d.max_frames,
+                     d.total_samples, d.lost_samples,
+                     percent(d.lost_samples, d.total_samples),
+                     dwfl_unwound_source_str(d.last_unwound),
+                     dwfl_unwound_source_str(d.worst_unwound));
+      sum_samples += d.total_samples;
+      sum_lost += d.lost_samples;
+    }
+  clog << "===\n";
+  clog << format(N_("TOTAL -- received {} samples, lost {} samples, loaded {} processes\n"),
+                 sum_samples, sum_lost,
+                 this->dwfl_tab.size() /* TODO: If implementing eviction, need to maintain a separate count of evicted pids. */);
+  clog << "\n";
+}
+
+////////////////////////////////////////////////////////////////////////
+// real perf consumer: unwind helpers
+
+// Construct an unwinder that forwards to USC and records into UST,
+// taking the frame limit from USC and starting a process tracker.
+// This overload does not assign `reader'.
+PerfConsumerUnwinder::PerfConsumerUnwinder(UnwindSampleConsumer* usc, UnwindStatsTable *ust)
+  : consumer(usc), stats(ust)
+{
+  this->maxframes = usc->maxframes();
+  this->tracker = dwflst_tracker_begin (&dwfl_cfi_callbacks);
+}
+
+// Same setup as the two-argument constructor, plus the PerfReader the
+// unwinder queries for its ebl and register mask.
+PerfConsumerUnwinder::PerfConsumerUnwinder(UnwindSampleConsumer* usc, UnwindStatsTable *ust, PerfReader *reader)
+  : PerfConsumerUnwinder(usc, ust)
+{
+  this->reader = reader;
+}
+
+// End the process tracker started by the constructor.
+PerfConsumerUnwinder::~PerfConsumerUnwinder()
+{
+  dwflst_tracker_end (this->tracker);
+}
+
+/* Resolve *PID to its thread group leader and open that process's
+   executable.
+
+   Returns 0 on success: *PID then holds the Tgid read from
+   /proc/PID/status (unchanged if no Tgid line is present), and *ELF /
+   *ELF_FD refer to /proc/PID/exe, or are NULL / -1 when it cannot be
+   opened or read as ELF.  On failure returns an errno-style code and
+   leaves *ELF and *ELF_FD untouched.  DWFL is not used.
+
+   TODO: Could be relocated to libdwfl/linux-pid-attach.c
+   to remove some duplication of existing linux-pid-attach code. */
+int PerfConsumerUnwinder::find_procfile (Dwfl *dwfl, pid_t *pid, Elf **elf, int *elf_fd)
+{
+  /* Make sure to report the actual PID (thread group leader) to
+     dwfl_attach_state.  */
+  ifstream procfile(format("/proc/{}/status", *pid));
+  if (!procfile)
+    return errno; /* XXX libdwfl would also set this for dwfl->attacherr.  */
+
+  string line;
+  while (getline (procfile, line))
+    if (startswith (line.c_str(), "Tgid:"))
+      {
+        errno = 0;
+        char *endptr;
+        long val = strtol (&line.c_str()[5], &endptr, 10);
+        /* std::getline drops the trailing newline, so a well-formed
+           number ends at the string's terminating NUL, not at '\n'.  */
+        if ((errno == ERANGE && val == LONG_MAX)
+            || *endptr != '\0' || val < 0 || val != (pid_t) val)
+          *pid = 0;
+        else
+          *pid = (pid_t) val;
+        break;
+      }
+
+  if (*pid == 0)
+    return ESRCH;
+
+  {
+    /* The task directory must exist for the process to be usable.  */
+    string name = format("/proc/{}/task", *pid);
+    DIR *dir = opendir (name.c_str());
+    if (dir == NULL)
+      return errno;
+    closedir(dir);
+  }
+
+  {
+    string name = format("/proc/{}/exe", *pid);
+    *elf_fd = open (name.c_str(), O_RDONLY);
+  }
+  if (*elf_fd >= 0)
+    {
+      *elf = elf_begin (*elf_fd, ELF_C_READ_MMAP, NULL);
+      if (*elf == NULL)
+        {
+          /* Just ignore, dwfl_attach_state will fall back to trying
+             to associate the Dwfl with one of the existing Dwfl_Module
+             ELF images (to know the machine/class backend to use).  */
+          if (verbose)
+            cerr << format(N_("WARNING: find_procfile pid {}: elf not found\n"), (long long)*pid);
+          close (*elf_fd);
+          *elf_fd = -1;
+        }
+    }
+  else
+    *elf = NULL;
+  return 0;
+}
+
+// Create a Dwfl on this unwinder's tracker and report PID's mappings
+// into it.  Returns the Dwfl, or NULL (after a warning when verbose)
+// if either reporting step fails.
+Dwfl *PerfConsumerUnwinder::init_dwfl(pid_t pid)
+{
+  Dwfl *dwfl = dwflst_tracker_dwfl_begin (this->tracker);
+
+  // NOTE(review): only negative results count as failure here; confirm
+  // whether dwfl_linux_proc_report can also return a positive code.
+  if (dwfl_linux_proc_report (dwfl, pid) < 0)
+    {
+      if (verbose)
+        cerr << format("WARNING: dwfl_linux_proc_report pid {}: {}\n", (long long) pid, dwfl_errmsg(-1));
+      return NULL;
+    }
+
+  if (dwfl_report_end (dwfl, NULL, NULL) != 0)
+    {
+      if (verbose)
+        cerr << format("WARNING: dwfl_report_end pid {}: {}\n", (long long) pid, dwfl_errmsg(-1));
+      return NULL;
+    }
+
+  return dwfl;
+}
+
+// C-style trampoline: ARG is the PerfConsumerUnwinder whose
+// init_dwfl() builds the Dwfl for PID.
+Dwfl *pcu_init_dwfl_cb (Dwflst_Process_Tracker *cb_tracker __attribute__ ((unused)),
+                        pid_t pid,
+                        void *arg)
+{
+  return static_cast<PerfConsumerUnwinder *>(arg)->init_dwfl (pid);
+}
+
+// Minimum number of sampled registers find_dwfl() requires before it
+// attempts an unwind, chosen by the machine EBL reports.
+uint32_t expected_frame_nregs (Ebl *ebl)
+{
+  switch (ebl_get_elfmachine(ebl))
+    {
+    case EM_ARM:
+      /* For aarch64, we actually use fewer than ebl->frame_nregs to unwind. */
+      return 14; /* XXX 16 for 32-bit ARM */
+
+    case EM_X86_64:
+    case EM_386:
+      /* On x86, expect everything except FLAGS: */
+      return ebl_frame_nregs(ebl);
+
+    default:
+      /* In general, it's better to be on the permissive side. */
+      return 1;
+    }
+}
+
+// Look up (or create, via pcu_init_dwfl_cb) the Dwfl for PID and
+// prepare last_us.sp / last_us.base from REGS.
+//
+// Returns NULL when NREGS is below expected_frame_nregs(), or when
+// find_procfile() returns a negative value.  Otherwise returns
+// whatever the tracker lookup produced, which may itself be NULL.
+// *CACHED is set to true when the Dwfl already has a pid attached; in
+// that case find_procfile() is skipped and *OUT_ELF is NULL.  When not
+// cached, the Dwfl is stored in the stats table.
+//
+// The stack pointer slot is selected from last_us.elfclass, which this
+// function reads but never sets.
+Dwfl *PerfConsumerUnwinder::find_dwfl(pid_t pid, const uint64_t *regs, uint32_t nregs,
+                                      Elf **out_elf, bool *cached)
+{
+  // Report the threshold that was actually compared against, not the
+  // backend's full frame register count.
+  uint32_t min_nregs = expected_frame_nregs(this->reader->ebl());
+  if (nregs < min_nregs)
+    {
+      if (verbose)
+        cerr << format(N_("WARNING: find_dwfl: nregs={}, expected at least {}\n"), nregs, min_nregs);
+      return NULL;
+    }
+
+  Elf *elf = NULL;
+  Dwfl *dwfl = dwflst_tracker_find_pid (this->tracker, pid, pcu_init_dwfl_cb, this);
+  int elf_fd = -1;
+  if (dwfl != NULL && dwfl_pid(dwfl) != -1 /* dwfl is attached */)
+    *cached = true;
+  else
+    {
+      // NOTE(review): find_procfile reports its failures as errno
+      // values (errno / ESRCH), so this negative check does not catch
+      // them.  Left unchanged here because tightening it changes which
+      // samples are dropped.
+      int err = this->find_procfile (dwfl, &pid, &elf, &elf_fd);
+      if (err < 0)
+        {
+          if (verbose)
+            cerr << format("WARNING: find_procfile pid {}: {}\n", (long long) pid, dwfl_errmsg(-1));
+          return NULL;
+        }
+    }
+
+  this->last_us.sp = regs[this->get_sp_reg(this->last_us.elfclass == ELFCLASS32)];
+  this->last_us.base = this->last_us.sp;
+
+  if (!*cached)
+    this->stats->pid_store_dwfl (pid, dwfl);
+  *out_elf = elf;
+  return dwfl;
+}
+
+/* Index of stack pointer within dwarf_regs order, chosen by the
+   reader's machine and by whether the sample uses the 32-bit ABI.
+   Asserts on any other machine.  */
+int PerfConsumerUnwinder::get_sp_reg(bool is_abi32)
+{
+  switch (ebl_get_elfmachine(this->reader->ebl()))
+    {
+    case EM_X86_64:
+    case EM_386:
+      return is_abi32 ? 4 : 7;
+
+    case EM_ARM:
+      return is_abi32 ? 13 : 31;
+
+    default:
+      assert(0);
+      return 7;
+    }
+}
+/* Per-frame callback body, reached through pcu_unwind_frame_cb from
+   dwflst_perf_sample_getframes.  Records the frame's unwound-source in
+   the pid's stats entry, optionally traces the frame and its module,
+   then appends the frame's pc to last_us.addrs.  Returns
+   DWARF_CB_ABORT when a libdwfl query fails or once more than
+   maxframes addresses have been collected, DWARF_CB_OK otherwise.  */
+int PerfConsumerUnwinder::unwind_frame_cb(Dwfl_Frame *state)
+{
+  Dwarf_Addr pc;
+  bool isactivation;
+  if (! dwfl_frame_pc (state, &pc, &isactivation))
+    {
+      if (verbose)
+       cerr << format("WARNING: dwfl_frame_pc: {}\n", dwfl_errmsg(-1));
+      return DWARF_CB_ABORT;
+    }
+
+  Dwarf_Addr pc_adjusted = pc - (isactivation ? 0 : 1); // step back one byte unless the frame is an activation
+  Dwarf_Addr sp;
+
+  int is_abi32 = (this->last_us.elfclass == ELFCLASS32);
+  int user_regs_sp = this->get_sp_reg(is_abi32);
+  int rc = dwfl_frame_reg (state, user_regs_sp, &sp);
+  if (rc < 0) // NOTE(review): a positive rc is not treated as failure -- confirm sp is always set in that case
+    {
+      if (verbose)
+       cerr << format("WARNING: dwfl_frame_reg: {}\n", dwfl_errmsg(-1));
+      return DWARF_CB_ABORT;
+    }
+
+  UnwindDwflStats *dwfl_ent = 
this->stats->pid_find_or_create(this->last_us.pid);
+  if (dwfl_ent != NULL)
+    {
+      Dwfl_Unwound_Source unwound_source = dwfl_frame_unwound_source(state);
+      if (unwound_source > dwfl_ent->worst_unwound) // keep the largest source value seen for this pid
+       dwfl_ent->worst_unwound = unwound_source;
+      dwfl_ent->last_unwound = unwound_source;
+      if (show_frames)
+       {
+         Dwfl_Module *m = dwfl_addrmodule(this->last_us.dwfl, pc);
+         uint64_t rel_pc = pc_adjusted;
+         int j = dwfl_module_relocate_address (m, &rel_pc);
+         (void) j;
+         clog << format("* frame {:d}: rel_pc={:x} raw_pc={:x} sp={:x}+{:x} 
[{}]\n",
+                        this->last_us.addrs.size(), rel_pc, pc_adjusted, 
this->last_us.base, (sp - this->last_us.base), 
dwfl_unwound_source_str(unwound_source));
+       }
+    }
+  else
+    {
+      if (show_frames) // same trace as above, minus the unwound-source name
+       {
+         Dwfl_Module *m = dwfl_addrmodule(this->last_us.dwfl, pc);
+         uint64_t rel_pc = pc_adjusted;
+         int j = dwfl_module_relocate_address (m, &rel_pc);
+         (void) j;
+         clog << format(N_("* frame {:d}: rel_pc={:x} raw_pc={:x} sp={:x}+{:x} 
[dwfl_ent not found]\n"),
+                        this->last_us.addrs.size(), rel_pc, pc_adjusted, 
this->last_us.base, (sp - this->last_us.base));
+       }
+    }
+  if (show_debugfile)
+    {
+      Dwfl_Module *m = dwfl_addrmodule(this->last_us.dwfl, pc);
+      if (m == NULL)
+       {
+         clog << format("* pid {:d} pc={:x} -> MODULE NOT FOUND\n",
+                        this->last_us.pid, pc);
+       }
+      else
+       {
+         const unsigned char *desc;
+         GElf_Addr vaddr;
+         int build_id_len = dwfl_module_build_id (m, &desc, &vaddr);
+         clog << format("* pid {:d} build_id=", this->last_us.pid);
+         for (int i = 0; i < build_id_len; ++i) // hex-dump the build-id bytes
+           clog << format("{:02x}", static_cast<int>(desc[i]));
+
+         const char *mainfile;
+         const char *debugfile;
+         const char *modname = dwfl_module_info (m, NULL, NULL, NULL, NULL,
+                                                 NULL, &mainfile, &debugfile);
+         clog << format("module={} mainfile={} debugfile={}\n",
+                        modname,
+                        mainfile ? mainfile : "<none>",
+                        debugfile ? debugfile : "<none>");
+         /* TODO: Also store this data to avoid repeated extraction for
+            the final buildid summary?  */
+#ifdef DEBUG_MODULES
+         Dwarf_Addr bias;
+         Dwarf_CFI *cfi_eh = dwfl_module_eh_cfi (m, &bias);
+         if (cfi_eh == NULL)
+           clog << format("* pc={:x} -> NO EH_CFI\n", pc);
+#endif
+       }
+    }
+
+  this->last_us.sp = sp;
+  this->last_us.addrs.push_back(pc); // the unadjusted pc is what gets recorded
+
+  /* e.g. gmon callgraphs only requires maxframes=1
+     (initial pc + one frame for caller ID only) */
+  if (this->last_us.addrs.size() > this->maxframes)
+    {
+      /* XXX without maxframes, very rarely, the unwinder can loop
+        infinitely; worth investigating? */
+      return DWARF_CB_ABORT;
+    }
+  return DWARF_CB_OK;
+}
+
+// C-style trampoline: ARG is the PerfConsumerUnwinder whose
+// unwind_frame_cb() handles STATE.
+int pcu_unwind_frame_cb(Dwfl_Frame *state, void *arg)
+{
+  return static_cast<PerfConsumerUnwinder *>(arg)->unwind_frame_cb(state);
+}
+
+
+////////////////////////////////////////////////////////////////////////
+// real perf consumer: event handler callbacks
+
+void PerfConsumerUnwinder::process_comm(const perf_event_header *sample,
+                                       uint32_t pid, uint32_t tid, bool exec, 
const string &comm)
+{ // COMM handler: currently does nothing with the record.
+  // NB: Could have dwflst ditch data for process and start anew, if EXEC.
+  // XXX: is this needed to avoid gradual memory leaks or pid reuse?
+}
+
+void PerfConsumerUnwinder::process_exit(const perf_event_header *sample,
+                                       uint32_t pid, uint32_t ppid,
+                                       uint32_t tid, uint32_t ptid)
+{ // EXIT handler: currently does nothing with the record.
+  // NB: Could have dwflst ditch data for process.
+  // XXX: is this needed to avoid gradual memory leaks or pid reuse?
+}
+
+void PerfConsumerUnwinder::process_fork(const perf_event_header *sample,
+                                       uint32_t pid, uint32_t ppid,
+                                       uint32_t tid, uint32_t ptid)
+{ // FORK handler: currently does nothing with the record.
+  // NB: Could have dwflst begin tracking a new process, but
+  // this will likely happen automatically when a packet is received
+  // from it.  The short duration between fork/exec typically means
+  // elfutils will pick up on the post-exec process -- we would have
+  // to work hard to replicate a situation where
+  // process_fork/process_comm handling are needed.
+}
+
+// Unwind one SAMPLE record and pass the resulting UnwindSample
+// (last_us) to the downstream consumer.
+//
+// REGS/NREGS are the sampled user registers, DATA/DATA_SIZE the
+// sampled user stack, ABI the register ABI reported by perf.  Per-pid
+// statistics are updated when show_summary or show_modules is set.  If
+// no Dwfl can be obtained for PID the sample is counted as lost and
+// the consumer is not called.
+void PerfConsumerUnwinder::process_sample(const perf_event_header *sample,
+                                          uint64_t ip,
+                                          uint32_t pid, uint32_t tid,
+                                          uint64_t time,
+                                          uint64_t abi,
+                                          uint32_t nregs, const uint64_t *regs,
+                                          uint64_t data_size, const uint8_t *data)
+{
+  string comm;
+  if (show_summary)
+    comm = this->stats->pid_find_comm(pid);
+
+  if (show_frames)
+    clog << "\n"; /* extra newline for padding */
+
+  /* find_dwfl picks the stack pointer slot from last_us.elfclass, so
+     it must already describe this sample rather than the previous
+     one.  */
+  this->last_us.elfclass = (abi == PERF_SAMPLE_REGS_ABI_32 ? ELFCLASS32 : ELFCLASS64);
+
+  Elf *elf = NULL; // Released during dwflst_tracker_end
+  bool cached = false;
+  Dwfl *dwfl = this->find_dwfl (pid, regs, nregs, &elf, &cached);
+
+  /* Sample counters are maintained under the same condition on the
+     success and failure paths, so "first_load" stops being true once
+     the pid has been counted.  */
+  const bool track_stats = (show_summary || show_modules);
+  UnwindDwflStats *dwfl_ent = NULL;
+  bool first_load = false; /* -> for show_modules: pid is loaded first time */
+  if (verbose || track_stats)
+    {
+      dwfl_ent = this->stats->pid_find_or_create(pid);
+      if (dwfl_ent->total_samples == 0)
+        first_load = true;
+    }
+  if (dwfl == NULL)
+    {
+      if (track_stats)
+        {
+          /* dwfl_ent loaded above */
+          dwfl_ent->total_samples++;
+          dwfl_ent->lost_samples++;
+        }
+      /* NOTE(review): the plain variant is printed even when verbose
+         is off; confirm whether it should be gated like the other
+         warnings.  */
+      if (verbose && show_summary)
+        {
+          cerr << format("WARNING: find_dwfl pid {} ({}) (failed)\n", (long long)pid, comm);
+        }
+      else
+        {
+          cerr << format("WARNING: find_dwfl pid {} (failed)\n", (long long)pid);
+        }
+      return;
+    }
+
+  if (show_samples || (first_load && show_modules))
+    {
+      bool is_abi32 = (abi == PERF_SAMPLE_REGS_ABI_32);
+      clog << format("find_dwfl {}pid {:d} {}({}): hdr_size={:d} size={:d}{} pc={:x} sp={:x}+{:d}\n",
+                     first_load ? "newly seen " : "", (long long)pid,
+                     (cached ? "(cached) " : ""), comm,
+                     sample->size, data_size,
+                     (is_abi32 ? " (32-bit)" : ""), ip,
+                     this->last_us.base, 0);
+    }
+
+  this->last_us.addrs.clear();
+  this->last_us.dwfl = dwfl;
+  this->last_us.pid = pid;
+  int rc = dwflst_perf_sample_getframes (dwfl, elf, pid, tid,
+                                         data, data_size,
+                                         regs, nregs,
+                                         this->reader->regs_mask(), abi,
+                                         pcu_unwind_frame_cb, this);
+  if (rc < 0)
+    {
+      /* dwfl_ent loaded above */
+      if (verbose && dwfl_ent->shown_errors < 10)
+        {
+          dwfl_ent->shown_errors ++;
+          cerr << format("WARNING: dwflst_perf_sample_getframes pid {}: {}{}\n",
+                         (long long)pid, dwfl_errmsg(-1),
+                         dwfl_ent->shown_errors >= 10 ?
+                         " (...suppressing further warnings for this pid)" : "");
+        }
+    }
+  if (track_stats)
+    {
+      /* For final diagnostics.  dwfl_ent loaded above */
+      if (this->last_us.addrs.size() > (unsigned long)dwfl_ent->max_frames)
+        dwfl_ent->max_frames = this->last_us.addrs.size();
+      dwfl_ent->total_samples++;
+      /* a stack of at most two addresses counts as lost when more were wanted */
+      if (this->maxframes > 2 && this->last_us.addrs.size() <= 2)
+        dwfl_ent->lost_samples++;
+    }
+
+  this->consumer->process (&this->last_us);
+  return;
+}
+
+// When a Dwfl is already stored for PID, report the new mapping
+// [addr, addr+len) to it under FILENAME.  Mappings for pids without a
+// stored Dwfl are ignored.
+void PerfConsumerUnwinder::process_mmap2(const perf_event_header *sample,
+                                         uint32_t pid, uint32_t tid,
+                                         uint64_t addr, uint64_t len, uint64_t pgoff,
+                                         uint8_t build_id_size, const uint8_t *build_id,
+                                         const char *filename)
+{
+  Dwfl *dwfl = this->stats->pid_find_dwfl(pid);
+  if (dwfl == NULL)
+    return;
+
+  const uint64_t map_start = addr;
+  const uint64_t map_end = addr + len;
+  dwfl_report_begin_add(dwfl);
+  dwfl_report_module(dwfl, filename, map_start, map_end);
+  dwfl_report_end(dwfl, NULL, NULL);
+}
+
+
+////////////////////////////////////////////////////////////////////////
+// unwind data consumers // basic statistics
+
+// Print the accumulated per-pid summary when the consumer goes away.
+UnwindStatsConsumer::~UnwindStatsConsumer()
+{
+  const auto *table = this->stats;
+  table->print_summary();
+}
+
+// Nothing to do per sample: most of the logic is handled by
+// UnwindStatsTable.
+void UnwindStatsConsumer::process(const UnwindSample* sample)
+{
+}
+
+// Frame limit for this consumer: opt_maxframes when it is
+// non-negative, otherwise 256.
+int UnwindStatsConsumer::maxframes()
+{
+  if (opt_maxframes < 0)
+    return 256;
+  return opt_maxframes;
+}
+
+
+////////////////////////////////////////////////////////////////////////
+// unwind data consumers // gprof
+
+/* gmon.out file format bits */
+#define GMON_MAGIC "gmon"
+#define GMON_VERSION 1
+
+struct gmon_hdr { // gmon.out file header layout
+  char cookie[4]; // presumably holds GMON_MAGIC -- confirm against the writer
+  char version[4]; // presumably holds GMON_VERSION -- confirm against the writer
+  char spare[3 * 4];
+};
+
+enum gmon_entry_tag { // tag value that starts each gmon record
+  GMON_TAG_TIME_HIST = 0, // the tag record_gmon_hist writes
+  GMON_TAG_CG_ARC = 1,
+  GMON_TAG_BB_COUNT = 2,
+};
+
+struct gmon_hist_hdr { // fields of a histogram record header
+  uint8_t tag; /* GMON_TAG_TIME_HIST */
+  uint8_t unused[3]; // NOTE(review): record_gmon_hist writes the tag as one byte with no padding after it
+  uint64_t low_pc;
+  uint64_t high_pc;
+  uint32_t num_buckets;
+  uint32_t prof_rate;
+  char _dimension_string[16]; // record_gmon_hist writes a 15-byte name plus a 1-byte abbreviation
+};
+
+/* Write one GMON_TAG_TIME_HIST record to OF for LOW_PC..HIGH_PC: the
+   tag byte, both pc bounds (4 or 8 bytes each according to this
+   program's own pointer size), the bucket count, attr.sample_freq as
+   the profiling rate, a 15-byte dimension name plus a 1-byte
+   abbreviation, then one uint16_t count per ALIGNMENT-sized bucket.
+   Each count sums the HISTOGRAM entries whose address falls inside the
+   bucket and is capped at the uint16_t maximum.  */
+void GprofUnwindSampleConsumer::record_gmon_hist(ostream &of, map<uint64_t, 
uint32_t> &histogram, uint64_t low_pc, uint64_t high_pc, uint64_t alignment)
+{
+  // write one histogram from low_pc ... high_pc
+  uint32_t num_buckets = (high_pc-low_pc)/alignment + 1;
+  double result_scale = 
(double)((high_pc-low_pc)/sizeof(uint16_t))/num_buckets;
+  if (verbose > 5)
+    /* It's the @scale value that must be kept within 0.000001 of 0.5 to
+       keep gprof from complaining. */
+    clog << format("+histogram {:x}..{:x} (alignment {}) of {} buckets @scale 
{}\n",
+                  low_pc, high_pc, alignment, num_buckets, result_scale);
+
+  // write histogram record header
+  unsigned char tag = GMON_TAG_TIME_HIST;
+  of.write(reinterpret_cast<const char *>(&tag), sizeof(tag));
+  int wordsize = (sizeof (void *) == 8) ? 8 : 4; // pointer width of this program, not of the profiled binary
+  if (wordsize == 4) {
+    uint32_t addr = low_pc;
+    of.write(reinterpret_cast<const char *>(&addr), sizeof(addr));
+    addr = high_pc;
+    of.write(reinterpret_cast<const char *>(&addr), sizeof(addr));
+  } else {
+    of.write(reinterpret_cast<const char *>(&low_pc), sizeof(low_pc));
+    of.write(reinterpret_cast<const char *>(&high_pc), sizeof(high_pc));
+  }
+  of.write(reinterpret_cast<const char *>(&num_buckets), sizeof(num_buckets));
+  uint32_t prof_rate = attr.sample_freq;
+  of.write(reinterpret_cast<const char *>(&prof_rate), sizeof(prof_rate));
+  // dimension string is 15 chars long (not null terminated)
+  std::string dimension_base = libpfm_event.empty() ? "ticks" :
+    libpfm_event.substr(0, 15);
+  dimension_base.resize(15, '\0');  // ensure exactly 15 bytes
+  of.write(dimension_base.data(), 15);
+  // dimension character abbreviation: just take the first char of above
+  of.write(dimension_base.data(), 1);
+
+  // write histogram buckets
+  uint64_t bucket_addr = low_pc;
+  int n_overflows = 0, max_overflows = 5; // limit 'bucket overflow' spam
+  for (uint32_t bucket = 0; bucket < num_buckets; bucket++)
+    {
+      uint16_t count = 0;
+      for (auto it = histogram.lower_bound(bucket_addr); /* entries in [bucket_addr, bucket_addr+alignment-1] */
+              it != histogram.upper_bound(bucket_addr+alignment-1);
+              it ++)
+       {
+         if (numeric_limits<uint16_t>::max() <= (int) count + (int) it->second) // also taken when the sum equals the maximum exactly
+           {
+             count = numeric_limits<uint16_t>::max();
+             // XXX: a provisional error message to give a sense of
+             // whether this happens often-enough to do something
+             // more complex, such as adjusting the histogram
+             // granularity:
+             if (n_overflows >= max_overflows) break;
+             n_overflows++;
+             cerr << format("WARNING: histogram bucket overflow at {:x}{}",
+                            bucket_addr,
+                            n_overflows >= max_overflows ?
+                            " (... suppressing further warnings for this 
histogram)" : "")
+                  << endl;
+             break;
+           }
+         count += it->second;
+       }
+      bucket_addr += alignment;
+      of.write(reinterpret_cast<const char *>(&count), sizeof(count));
+    }
+}
+
+/* Write the output files for one BUILDID into output_dir:
+
+     gmon.BUILDID.exe   symlink to the module's main file, when a path
+                        is known for this buildid
+     gmon.BUILDID.json  metadata: buildid, main/debug file paths, the
+                        event description and the branch-record flag
+     gmon.BUILDID.out   gmon header, one or more pc histograms built
+                        from M.histogram (split per gmon_hist_split)
+                        and the call graph arcs from M.callgraph
+
+   Problems are reported on cerr.  If the JSON metadata cannot be
+   built, or the gmon file cannot be opened, the function returns
+   early.  Filesystem cleanup uses non-throwing calls because this is
+   invoked from the destructor.  */
+void GprofUnwindSampleConsumer::record_gmon_out(const string& buildid, UnwindModuleStats& m)
+{
+  string filename = output_dir + "/" + "gmon." + buildid + ".out";
+  string exe_symlink_path = output_dir + "/" + "gmon." + buildid + ".exe";
+  string json_path = output_dir + "/" + "gmon." + buildid + ".json";
+
+  if (output_force) {
+    /* Non-throwing overloads: an exception escaping from here would
+       propagate out of the destructor and terminate the program.  A
+       file that could not be removed shows up as a failure when we
+       try to recreate it below.  */
+    std::error_code ec;
+    filesystem::remove(filename, ec);
+    filesystem::remove(exe_symlink_path, ec);
+    filesystem::remove(json_path, ec);
+  }
+
+  /* Use find() rather than operator[], which would silently insert an
+     empty path for a buildid we have no file name for.  */
+  const auto mainfile_it = buildid_to_mainfile.find(buildid);
+  const auto debugfile_it = buildid_to_debugfile.find(buildid);
+
+  // skip .exe symlink if there's no path
+  if (mainfile_it != buildid_to_mainfile.end()
+      && !mainfile_it->second.empty()
+      && mainfile_it->second != unknown_comm)
+    if (symlink(mainfile_it->second.c_str(), exe_symlink_path.c_str()) == -1) {
+      cerr << format("WARNING: symlink failed: {}\n", strerror(errno));
+      // NB: no return needed here; proceed to write out other bits.
+      // A smart enough consumer will make do with buildid based executable lookup.
+    }
+
+  /* Build the metadata object.  Every value added with
+     json_object_object_add() is handed over to METADATA, so one
+     json_object_put (metadata) releases everything on all paths.  */
+  json_object *metadata = json_object_new_object();
+  auto add_string = [metadata](const char *key, const string &value) -> bool
+    {
+      json_object *js = json_object_new_string(value.c_str());
+      if (NULL == js)
+        return false;
+      json_object_object_add(metadata, key, js);
+      return true;
+    };
+
+  bool json_ok = (metadata != NULL);
+  if (json_ok)
+    json_ok = add_string("buildid", buildid);
+  if (json_ok && mainfile_it != buildid_to_mainfile.end())
+    json_ok = add_string("mainfile", mainfile_it->second);
+  if (json_ok && debugfile_it != buildid_to_debugfile.end())
+    json_ok = add_string("debugfile", debugfile_it->second);
+  if (json_ok && libpfm_event != "")
+    json_ok = add_string("libpfm-event", libpfm_event);
+  if (json_ok && libpfm_event_decoded != "")
+    json_ok = add_string("libpfm-event-decoded", libpfm_event_decoded);
+  if (json_ok)
+    {
+      json_object *br_js = json_object_new_boolean(branch_record);
+      if (NULL == br_js)
+        json_ok = false;
+      else
+        json_object_object_add(metadata, "branch-record", br_js);
+    }
+
+  /* The returned string is owned by METADATA and stays valid until
+     the json_object_put() below.  */
+  const char *metadata_str = json_ok ? json_object_to_json_string(metadata) : NULL;
+  if (!metadata_str)
+    {
+      cerr << format("ERROR: json allocation failed: {}\n", strerror(errno));
+      if (metadata)
+        json_object_put (metadata);
+      return;
+    }
+  ofstream of_js (json_path);
+  of_js << metadata_str;
+  of_js.close();
+  if (!of_js)
+    cerr << format("WARNING: could not write '{}'\n", json_path);
+  json_object_put (metadata);
+
+  ofstream of (filename, ios::binary);
+  if (!of)
+    {
+      cerr << format(N_("ERROR: buildid {} -- could not open '{}' for writing\n"), buildid, filename);
+      return; /* nothing below can succeed on a stream that never opened */
+    }
+
+  /* Write gmon header.  It and other headers mostly hold
+     native-endian and fixed (or native) bitwidth values.  In
+     principle, we should get the bitness/endianness from the
+     particular executable associated with the buildid.  But, being a
+     live profiler, we don't really have to deal with CROSS
+     architecture work, and for now can just hard-code the bitness to
+     match this host program. XXX
+   */
+  int wordsize = (sizeof (void *) == 8) ? 8 : 4;
+  struct gmon_hdr ghdr;
+  memcpy (&ghdr.cookie[0], GMON_MAGIC, 4);
+  uint32_t version = GMON_VERSION;
+  memcpy (&ghdr.version[0], reinterpret_cast<const char *>(&version), 4);
+  memset (&ghdr.spare[0], 0, sizeof(ghdr.spare));
+  of.write(reinterpret_cast<const char *>(&ghdr), sizeof(ghdr));
+
+  if (m.histogram.size() > 0)
+    {
+      uint64_t low_pc = m.histogram.begin()->first;
+      uint64_t high_pc = m.histogram.rbegin()->first;
+      /* Smallest bucket width that keeps the bucket count of a single
+         histogram over the whole range within 32 bits.  */
+      uint64_t alignment = (high_pc - low_pc + 1) / UINT_MAX + 1;
+
+      if (gmon_hist_split == HIST_SPLIT_NONE)
+        {
+          /* Put everything into one histogram. */
+          this->record_gmon_hist(of, m.histogram, low_pc, high_pc, alignment);
+        }
+      else if (gmon_hist_split == HIST_SPLIT_EVEN)
+        {
+          /* This option attempts to satisfy gprof's histogram scale
+             consistency check, which requires all values
+             '(double)(high_pc-low_pc)/num_buckets' to fall within
+             EPSILON.  In practice, we can only be sure of this if we
+             cover the address space with histograms all one size.  */
+
+          /* Keep the search for 'optimal' size simple -- we just need
+             a plausible order of magnitude.  XXX Some rechecking of
+             correctness needed.  */
+          //uint64_t min_size = 1; // this is 'optimal' much of the time
+          uint64_t min_size = 1024;
+          uint64_t max_size = high_pc - low_pc;
+          uint64_t opt_size = min_size;
+          uint64_t opt_est = 0;
+          uint64_t next_size = opt_size;
+          /* Try sizes min_size, 2*min_size, ... below max_size and keep
+             the one with the smallest estimated output size.  */
+          while (next_size < max_size)
+            {
+              uint64_t size_inc = sizeof(struct gmon_hdr) + next_size;
+              uint64_t size_est = size_inc;
+              uint64_t pc = low_pc;
+              while (pc + size_est < high_pc)
+                {
+                  auto it = m.histogram.upper_bound(pc + size_est/alignment);
+                  if (it == m.histogram.end())
+                    break;
+                  pc = it->first;
+                  size_est += size_inc;
+                }
+              if (opt_est == 0 || size_est < opt_est)
+                {
+                  opt_size = next_size;
+                  opt_est = size_est;
+                }
+              // if (opt_est > prev_est) break; /* XXX: We've hit the lowest point. */
+              next_size = 2 * next_size;
+            }
+
+          /* Partition into histograms of opt_size.
+             XXX: May need to check if low_pc must be aligned.  */
+          for (const auto& p : m.histogram)
+            {
+              uint64_t pc = p.first;
+              if (pc - low_pc > opt_size)
+                {
+                  /* Record a histogram from low_pc to low_pc+opt_size;
+                     this covers every pc seen since low_pc.  */
+                  this->record_gmon_hist(of, m.histogram,
+                                         low_pc, low_pc+opt_size-1,
+                                         alignment);
+                  low_pc = pc;
+                }
+            }
+          /* Record a final histogram from low_pc to low_pc+opt_size.
+             XXX: Edge case -- may want to adjust for overflow of
+             low_pc+opt_size at end of address space.  */
+          this->record_gmon_hist(of, m.histogram,
+                                 low_pc, low_pc+opt_size-1,
+                                 alignment);
+        }
+      else if (gmon_hist_split == HIST_SPLIT_FLEX)
+        {
+          /* Allow variable-size histograms to save on storage space.
+             Will fail gprof's input consistency checks, XXX but ok
+             for profiledb purposes?  */
+          uint64_t prev_pc = low_pc;
+          uint64_t pc = prev_pc;
+          /* XXX Iterate histogram ascending by key, faster than by addr
+             when we just need to scan for gaps.  */
+          for (const auto& p : m.histogram)
+            {
+              pc = p.first;
+              uint64_t bin_dist = (pc - prev_pc) / alignment;
+              if (bin_dist > sizeof(struct gmon_hist_hdr))
+                /* XXX If we add '&& low_pc != prev_pc && pc != high_pc',
+                   this avoids producing a histogram with only 1 entry,
+                   but this is still not enough to satisfy gprof's
+                   histogram scale calculation.  */
+                {
+                  /* Record a histogram from low_pc to prev_pc. */
+                  this->record_gmon_hist(of, m.histogram, low_pc, prev_pc, alignment);
+                  low_pc = pc;
+                }
+              prev_pc = pc;
+            }
+          /* Record a final histogram from low_pc to pc. */
+          this->record_gmon_hist(of, m.histogram, low_pc, pc, alignment);
+        }
+    }
+
+  /* Write call graph arcs. */
+  for (auto& p : m.callgraph)
+    {
+      unsigned char tag = GMON_TAG_CG_ARC;
+      of.write(reinterpret_cast<const char *>(&tag), sizeof(tag));
+      if (wordsize == 4) {
+        uint32_t addr = p.first.first;
+        of.write(reinterpret_cast<const char *>(&addr), sizeof(addr));
+        addr = p.first.second;
+        of.write(reinterpret_cast<const char *>(&addr), sizeof(addr));
+      } else {
+        uint64_t addr = p.first.first;
+        of.write(reinterpret_cast<const char *>(&addr), sizeof(addr));
+        addr = p.first.second;
+        of.write(reinterpret_cast<const char *>(&addr), sizeof(addr));
+      }
+      /* p is (from,to) -> count */
+      uint32_t count = p.second;
+      of.write(reinterpret_cast<const char *>(&count), sizeof(count));
+    }
+
+  of.close();
+  /* Any failed write (or the final flush) leaves the stream in a
+     failed state; say so instead of leaving a truncated file behind
+     unnoticed.  */
+  if (!of)
+    cerr << format("ERROR: buildid {} -- failed writing '{}'\n", buildid, filename);
+}
+
+/* Flush everything collected so far: call record_gmon_out() once per
+   buildid in the stats table and, when show_summary is set, print a
+   per-buildid and total report on clog.  */
+GprofUnwindSampleConsumer::~GprofUnwindSampleConsumer()
+{
+  if (show_summary)
+    {
+      this->stats->print_summary ();
+      clog << "=== buildid / sample counts ===\n";
+    }
+
+  /* Work on a copy of the table held in buildid_map_t (presumably an
+     ordered map), so that the output comes out sorted by buildid.  */
+  UnwindStatsTable::buildid_map_t sorted_map (this->stats->buildid_tab.begin(), this->stats->buildid_tab.end());
+  for (auto& [buildid, module_stats] : sorted_map)
+    {
+      this->record_gmon_out(buildid, module_stats);
+      if (!show_summary)
+        continue;
+
+      /* record_gmon_out has just written the buildid-->path mapping
+         to a json metadata file.  That makes for a reasonable hint;
+         debuginfod-find can be used as a mostly-functional fallback
+         (for packaged rather than locally built executables) if the
+         results are moved to another system.  */
+      const auto main_it = buildid_to_mainfile.find(buildid);
+      string mainfile = (main_it != buildid_to_mainfile.end())
+        ? string(main_it->second) : string("<unknown>");
+
+      const auto debug_it = buildid_to_debugfile.find(buildid);
+      string debugfile = (debug_it != buildid_to_debugfile.end())
+        ? string(debug_it->second) : string("");
+
+      const char *debug_sep = debugfile.empty() ? "" : " +debugfile ";
+      /* TODO also count samples / estimated histogram size? */
+      clog << format(N_("buildid {} ({}{}{}) -- received {} distinct pcs, {} callgraph arcs\n"),
+                     buildid,
+                     mainfile,
+                     debug_sep,
+                     debugfile,
+                     module_stats.histogram.size(),
+                     module_stats.callgraph.size());
+    }
+
+  if (show_summary)
+    {
+      clog << "===\n";
+      clog << format(N_("TOTAL -- received {} buildids\n"), this->stats->buildid_tab.size());
+    }
+  clog << "\n";
+}
+
+
+/* Number of frames this consumer asks for per sample: the value of
+   opt_maxframes (set by the user's --maxframes option) when it is
+   non-negative, otherwise 1, since gprof output only needs one level
+   of backtrace.  */
+int
+GprofUnwindSampleConsumer::maxframes()
+{
+  if (opt_maxframes < 0)
+    return 1;
+  return opt_maxframes;
+}
+
+
+/* Account one unwound SAMPLE.
+
+   addrs[0] is the sampled pc; addrs[1], if present, is its caller.
+   The pc is attributed to the module (identified by build-id) that
+   contains it and recorded as a module-relative address.  When the
+   caller lies in the same module, a caller->callee arc is recorded
+   as well.  Samples that cannot be attributed -- no module, no
+   build-id, address outside the module's range, or an address that
+   libdwfl cannot relocate -- are dropped.  */
+void GprofUnwindSampleConsumer::process(const UnwindSample *sample)
+{
+  if (sample->addrs.size() < 1)
+    return; /* edge case -- no pc or callgraph arc */
+
+  const bool have_caller = sample->addrs.size() >= 2;
+  Dwarf_Addr pc = sample->addrs[0];
+  Dwarf_Addr pc2 = have_caller ? sample->addrs[1] : 0;
+
+  Dwfl_Module *mod = dwfl_addrmodule(sample->dwfl, pc);
+  if (mod == NULL)
+    return;
+
+  // extract buildid for pc (hit callee)
+  const unsigned char *desc = nullptr;
+  GElf_Addr vaddr;
+  int build_id_len = dwfl_module_build_id(mod, &desc, &vaddr);
+  if (build_id_len <= 0)
+    return; // TODO: report/tabulate hit outside known modules
+
+  // possible optimization would be to use the unconverted build_id_desc as hash key
+  string buildid;
+  buildid.reserve(2 * static_cast<size_t>(build_id_len));
+  for (int i = 0; i < build_id_len; ++i) {
+    buildid += format("{:02x}", static_cast<int>(desc[i]));
+  }
+
+  /* Initialize the out-parameters so that nothing indeterminate is
+     read below should dwfl_module_info leave one of them untouched.  */
+  const char *mainfile_cstr = nullptr;
+  const char *debugfile_cstr = nullptr;
+  Dwarf_Addr low_addr = 0;
+  Dwarf_Addr high_addr = 0;
+  dwfl_module_info (mod, NULL, &low_addr, &high_addr, NULL,
+                    NULL, &mainfile_cstr, &debugfile_cstr);
+  string mainfile = mainfile_cstr ? mainfile_cstr : "<unknown>";
+  string debugfile = debugfile_cstr ? debugfile_cstr : "";
+  if (!buildid_to_mainfile.count(buildid))
+    buildid_to_mainfile[buildid] = mainfile;
+  if (!buildid_to_debugfile.count(buildid))
+    buildid_to_debugfile[buildid] = debugfile;
+  /* XXX: Also monitor for collisions here? */
+
+  UnwindModuleStats *buildid_ent = this->stats->buildid_find_or_create(buildid, mod);
+
+  /* Turn ADDR into a module-relative address in place.  Returns false
+     (after an optional verbose note) if the raw address is outside
+     the module's range or libdwfl could not relocate it; in the
+     latter case ADDR is still the raw address and must not be
+     recorded.  */
+  auto relocate = [&](Dwarf_Addr &addr) -> bool
+    {
+      const Dwarf_Addr raw_addr = addr;
+      int rc = dwfl_module_relocate_address (mod, &addr);
+      /* XXX: Out-of-range address seen with ld-linux.so, not useful for profiledb purposes: */
+      if (raw_addr < low_addr || raw_addr > high_addr)
+        {
+          if (verbose)
+            clog << format(N_("{}: Skipping pc={:x} raw_pc={:x} outside module range start={:x}..end={:x}\n"),
+                           mainfile, addr, raw_addr, low_addr, high_addr);
+          return false;
+        }
+      if (rc < 0)
+        {
+          if (verbose)
+            clog << format("{}: Skipping raw_pc={:x}, relocation failed: {}\n",
+                           mainfile, raw_addr, dwfl_errmsg (-1));
+          return false;
+        }
+      // XXX: could get dwfl_module_relocation_info (mod, rc, NULL), but no need?
+      return true;
+    };
+
+  if (!relocate(pc))
+    return;
+  buildid_ent->record_pc(pc);
+
+  // If caller & callee are in different modules, this is a cross-shared-library
+  // call, so we can't track it as a call-graph arc.  TODO: at least count them
+  if (!have_caller || dwfl_addrmodule(sample->dwfl, pc2) != mod)
+    return;
+
+  // intra-module call: map pc2 also
+  if (!relocate(pc2))
+    return;
+  buildid_ent->record_callgraph_arc(pc2, pc);
+}
+
-- 
2.53.0


Reply via email to