The branch main has been updated by bnovkov:

URL: 
https://cgit.FreeBSD.org/src/commit/?id=310162ea218a6e6b9d5b2cc6c39f8634f58dc555

commit 310162ea218a6e6b9d5b2cc6c39f8634f58dc555
Author:     Bojan Novković <bnov...@freebsd.org>
AuthorDate: 2025-07-16 15:32:18 +0000
Commit:     Bojan Novković <bnov...@freebsd.org>
CommitDate: 2025-07-16 16:40:37 +0000

    hwt(4): Add Intel Processor Trace backend
    
    Differential Revision:  https://reviews.freebsd.org/D46397
    Reviewed by:    kib, markj, br
---
 sys/amd64/pt/pt.c       | 977 ++++++++++++++++++++++++++++++++++++++++++++++++
 sys/amd64/pt/pt.h       |  49 +++
 sys/modules/Makefile    |   2 +
 sys/modules/pt/Makefile |   8 +
 4 files changed, 1036 insertions(+)

diff --git a/sys/amd64/pt/pt.c b/sys/amd64/pt/pt.c
new file mode 100644
index 000000000000..d96da1c1ac17
--- /dev/null
+++ b/sys/amd64/pt/pt.c
@@ -0,0 +1,977 @@
+/*
+ * Copyright (c) 2025 Bojan Novković <bnov...@freebsd.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+/*
+ * hwt(4) Intel Processor Trace (PT) backend
+ *
+ * Driver Design Overview
+ *
+ * - Since PT is configured on a per-core basis, the driver uses
+ *   'smp_rendezvous' to start and disable tracing on each target core.
+ * - PT-specific resources are stored in a 'struct pt_ctx' context
+ *   structure for
+ *   each traced CPU core or thread. Upon initialization, a ToPA configuration
+ *   is generated for each 'pt_ctx' structure using the HWT tracing buffers.
+ *   The HWT tracing buffer is split into 4K ToPA entries. Currently, each
+ *   4K ToPA entry is configured to trigger an interrupt after it is filled.
+ * - The PT driver uses the XSAVE/XRSTOR PT extensions to load and save all
+ *   relevant PT registers. Every time a traced thread is switched
+ *   out or in, its state will be saved to or loaded from its corresponding
+ *   'pt_ctx' context.
+ * - When tracing starts, the PT hardware will start writing data into the
+ *   tracing buffer. When a TOPA_INT entry is filled, it will trigger an
+ *   interrupt before continuing. The interrupt handler will then fetch the
+ *   last valid tracing buffer offset and enqueue a HWT_RECORD_BUFFER record.
+ *   The driver is currently configured to use the NMI interrupt line.
+ * - The userspace PT backend waits for incoming HWT_RECORD_BUFFER records
+ *   and uses the offsets to decode data from the tracing buffer.
+ *
+ * Future improvements and limitations
+ *
+ * - We currently configure the PT hardware to trigger an interrupt whenever
+ *   a 4K ToPA entry is filled. While this is fine when tracing smaller
+ *   functions or infrequent code paths, this will generate too much interrupt
+ *   traffic when tracing hotter functions. A proper solution for this issue
+ *   should estimate the amount of data generated by the current configuration
+ *   and use it to determine interrupt frequency.
+ *
+ * - Support for more tracing options and PT features.
+ *
+ */
+
+#include <sys/systm.h>
+#include <sys/hwt.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/sdt.h>
+#include <sys/smp.h>
+#include <sys/taskqueue.h>
+
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+
+#include <machine/atomic.h>
+#include <machine/cpufunc.h>
+#include <machine/fpu.h>
+#include <machine/smp.h>
+#include <machine/specialreg.h>
+
+#include <x86/apicvar.h>
+#include <x86/x86_var.h>
+
+#include <dev/hwt/hwt_context.h>
+#include <dev/hwt/hwt_vm.h>
+#include <dev/hwt/hwt_backend.h>
+#include <dev/hwt/hwt_config.h>
+#include <dev/hwt/hwt_cpu.h>
+#include <dev/hwt/hwt_record.h>
+#include <dev/hwt/hwt_thread.h>
+
+#include <amd64/pt/pt.h>
+
+#ifdef PT_DEBUG
+#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__)
+#else
+#define dprintf(fmt, ...)
+#endif
+#define PT_SUPPORTED_FLAGS                                             \
+       (RTIT_CTL_MTCEN | RTIT_CTL_CR3FILTER | RTIT_CTL_DIS_TNT |       \
+           RTIT_CTL_USER | RTIT_CTL_OS | RTIT_CTL_BRANCHEN)
+#define PT_XSAVE_MASK (XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE)
+#define PT_XSTATE_BV (PT_XSAVE_MASK | XFEATURE_ENABLED_PT)
+#define PT_MAX_IP_RANGES 2
+
+#define PT_TOPA_MASK_PTRS 0x7f
+#define PT_TOPA_PAGE_MASK 0xffffff80
+#define PT_TOPA_PAGE_SHIFT 7
+
+#define CPUID_PT_LEAF  0x14
+
+MALLOC_DEFINE(M_PT, "pt", "Intel Processor Trace");
+
+SDT_PROVIDER_DEFINE(pt);
+SDT_PROBE_DEFINE(pt, , , topa__intr);
+
+TASKQUEUE_FAST_DEFINE_THREAD(pt);
+
+static void pt_send_buffer_record(void *arg, int pending __unused);
+static int pt_topa_intr(struct trapframe *tf);
+
/*
 * Intel Processor Trace XSAVE-managed state.
 * Mirrors the PT state component saved/restored with XSAVES/XRSTORS;
 * located at 'pt_info.pt_xsave_offset' within a context's save area.
 */
struct pt_ext_area {
	uint64_t rtit_ctl;		/* Trace enable/feature control. */
	uint64_t rtit_output_base;	/* Physical base of the ToPA table. */
	uint64_t rtit_output_mask_ptrs;	/* Current ToPA entry and offset. */
	uint64_t rtit_status;		/* Tracing status. */
	uint64_t rtit_cr3_match;	/* CR3 filter value. */
	uint64_t rtit_addr0_a;		/* IP filter range 0 start. */
	uint64_t rtit_addr0_b;		/* IP filter range 0 end. */
	uint64_t rtit_addr1_a;		/* IP filter range 1 start. */
	uint64_t rtit_addr1_b;		/* IP filter range 1 end. */
};
+
struct pt_buffer {
	uint64_t *topa_hw; /* ToPA table entries. */
	size_t size;	   /* Total trace buffer size in bytes. */
	struct mtx lock; /* Lock for fields below. */
	vm_offset_t offset;	/* Offset within the current ToPA page. */
	uint64_t wrap_count;	/* Times the circular buffer wrapped. */
	int curpage;		/* Index of the current ToPA entry. */
};
+
struct pt_ctx {
	int id;		      /* CPU or thread id; used as record buf_id. */
	struct pt_buffer buf; /* ToPA buffer metadata */
	struct task task;     /* ToPA buffer notification task */
	struct hwt_context *hwt_ctx;	/* Owning HWT tracing session. */
	uint8_t *save_area; /* PT XSAVE area */
};
/* PT tracing contexts used for CPU mode. */
static struct pt_ctx *pt_pcpu_ctx;
+
/* Per-CPU tracing state; read/written atomically (see pt_cpu_get_state). */
enum pt_cpu_state {
	PT_DISABLED = 0,	/* Not configured. */
	PT_STOPPED,		/* Configured, but tracing not running. */
	PT_ACTIVE		/* Tracing currently running on this CPU. */
};

static struct pt_cpu {
	struct pt_ctx *ctx;      /* active PT tracing context */
	enum pt_cpu_state state; /* used as part of trace stop protocol */
} *pt_pcpu;
+
/*
 * PT-related CPUID bits.
 */
static struct pt_cpu_info {
	uint32_t l0_eax;	/* CPUID 0x14, sub-leaf 0, %eax. */
	uint32_t l0_ebx;	/* CPUID 0x14, sub-leaf 0, %ebx. */
	uint32_t l0_ecx;	/* CPUID 0x14, sub-leaf 0, %ecx. */
	uint32_t l1_eax;	/* CPUID 0x14, sub-leaf 1, %eax. */
	uint32_t l1_ebx;	/* CPUID 0x14, sub-leaf 1, %ebx. */
	size_t xsave_area_size;		/* Size of the PT XSAVE area. */
	size_t xstate_hdr_offset;	/* Offset of the XSAVE header. */
	size_t pt_xsave_offset;		/* Offset of the PT state component. */
} pt_info  __read_mostly;

/* Set once module load fully succeeds; checked on unload. */
static bool initialized = false;
/* Non-zero while a CPU-mode tracing session is active. */
static int cpu_mode_ctr = 0;
+
+static __inline enum pt_cpu_state
+pt_cpu_get_state(int cpu_id)
+{
+       return (atomic_load_int(&pt_pcpu[cpu_id].state));
+}
+
+static __inline void
+pt_cpu_set_state(int cpu_id, enum pt_cpu_state state)
+{
+       atomic_store_int(&pt_pcpu[cpu_id].state, state);
+}
+
+static __inline struct xstate_hdr *
+pt_ctx_get_xstate_hdr(struct pt_ctx *ctx)
+{
+       return ((struct xstate_hdr *)(ctx->save_area +
+           pt_info.xstate_hdr_offset));
+}
+
+
+static __inline struct pt_ext_area *
+pt_ctx_get_ext_area(struct pt_ctx *ctx)
+{
+       return ((struct pt_ext_area *)(ctx->save_area +
+           pt_info.pt_xsave_offset));
+}
+
/*
 * Updates current trace buffer offset from the
 * ToPA MSRs. Records if the trace buffer wrapped.
 */
static __inline void
pt_update_buffer(struct pt_buffer *buf)
{
	uint64_t reg;
	int curpage;

	/* Update buffer offset. */
	reg = rdmsr(MSR_IA32_RTIT_OUTPUT_MASK_PTRS);
	/* Bits 31:7 hold the index of the current ToPA table entry. */
	curpage = (reg & PT_TOPA_PAGE_MASK) >> PT_TOPA_PAGE_SHIFT;
	mtx_lock_spin(&buf->lock);
	/* Check if the output wrapped. */
	if (buf->curpage > curpage)
		buf->wrap_count++;
	buf->curpage = curpage;
	/* Bits 63:32 hold the byte offset within the current output page. */
	buf->offset = reg >> 32;
	mtx_unlock_spin(&buf->lock);

	dprintf("%s: wrap_cnt: %lu, curpage: %d, offset: %zu\n", __func__,
	    buf->wrap_count, buf->curpage, buf->offset);
}
+
+static __inline void
+pt_fill_buffer_record(int id, struct pt_buffer *buf,
+    struct hwt_record_entry *rec)
+{
+       rec->record_type = HWT_RECORD_BUFFER;
+       rec->buf_id = id;
+       rec->curpage = buf->curpage;
+       rec->offset = buf->offset + (buf->wrap_count * buf->size);
+}
+
/*
 * Enables or disables tracing on curcpu
 * using the XSAVE/XRSTOR PT extensions.
 */
static void
pt_cpu_toggle_local(uint8_t *save_area, bool enable)
{
	u_long xcr0, cr0;
	u_long xss;

	/* Clear CR0.TS so XSAVES/XRSTORS do not raise #NM. */
	cr0 = rcr0();
	if (cr0 & CR0_TS)
		clts();
	/* Temporarily enable the x87/SSE components in XCR0. */
	xcr0 = rxcr(XCR0);
	if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK)
		load_xcr(XCR0, xcr0 | PT_XSAVE_MASK);
	/* Enable the PT component in IA32_XSS for supervisor save/restore. */
	xss = rdmsr(MSR_IA32_XSS);
	wrmsr(MSR_IA32_XSS, xss | XFEATURE_ENABLED_PT);

	if (!enable) {
		/* XSAVES of the PT state stops tracing (per SDM). */
		KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) != 0,
		    ("%s: PT is disabled", __func__));
		xsaves(save_area, XFEATURE_ENABLED_PT);
	} else {
		/* XRSTORS of the PT state re-enables tracing via RTIT_CTL. */
		KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) == 0,
		    ("%s: PT is enabled", __func__));
		xrstors(save_area, XFEATURE_ENABLED_PT);
	}
	/* Restore the previous XSS, XCR0 and CR0.TS values. */
	wrmsr(MSR_IA32_XSS, xss);
	if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK)
		load_xcr(XCR0, xcr0);
	if (cr0 & CR0_TS)
		load_cr0(cr0);
}
+
/*
 * Starts PT tracing on 'curcpu'.
 * Runs either directly or via smp_rendezvous; the 'dummy' argument
 * exists only to match the rendezvous callback signature.
 */
static void
pt_cpu_start(void *dummy)
{
	struct pt_cpu *cpu;

	cpu = &pt_pcpu[curcpu];
	MPASS(cpu->ctx != NULL);

	dprintf("%s: curcpu %d\n", __func__, curcpu);
	/* XSAVE must be enabled before toggling PT state. */
	load_cr4(rcr4() | CR4_XSAVE);
	wrmsr(MSR_IA32_RTIT_STATUS, 0);
	/* Mark active before enabling so the PMI handler accepts interrupts. */
	pt_cpu_set_state(curcpu, PT_ACTIVE);
	pt_cpu_toggle_local(cpu->ctx->save_area, true);
}
+
/*
 * Stops PT tracing on 'curcpu'.
 * Updates trace buffer offset to ensure
 * any data generated between the last interrupt
 * and the trace stop gets picked up by userspace.
 */
static void
pt_cpu_stop(void *dummy)
{
	struct pt_cpu *cpu;
	struct pt_ctx *ctx;

	/* Shutdown may occur before PT gets properly configured. */
	if (pt_cpu_get_state(curcpu) == PT_DISABLED)
		return;

	cpu = &pt_pcpu[curcpu];
	ctx = cpu->ctx;
	MPASS(ctx != NULL);
	dprintf("%s: curcpu %d\n", __func__, curcpu);

	/* Publish PT_STOPPED first so the PMI handler won't resume tracing. */
	pt_cpu_set_state(curcpu, PT_STOPPED);
	pt_cpu_toggle_local(cpu->ctx->save_area, false);
	/* Pick up data generated since the last ToPA interrupt. */
	pt_update_buffer(&ctx->buf);
}
+
+/*
+ * Prepares the Table of Physical Addresses (ToPA) metadata for 'pt_ctx'.
+ * The HWT trace buffer is split into 4K ToPA table entries and used
+ * as a circular buffer, meaning that the last ToPA entry points to
+ * the first ToPA entry. Each entry is configured to raise an
+ * interrupt after being filled.
+ */
+static int
+pt_topa_prepare(struct pt_ctx *ctx, struct hwt_vm *vm)
+{
+       struct pt_buffer *buf;
+       size_t topa_size;
+       int i;
+
+       topa_size = TOPA_SIZE_4K;
+       buf = &ctx->buf;
+
+       KASSERT(buf->topa_hw == NULL,
+           ("%s: ToPA info already exists", __func__));
+       buf->topa_hw = mallocarray(vm->npages + 1, sizeof(uint64_t), M_PT,
+           M_ZERO | M_WAITOK);
+       dprintf("%s: ToPA virt addr %p\n", __func__, buf->topa_hw);
+       buf->size = vm->npages * PAGE_SIZE;
+       for (i = 0; i < vm->npages; i++) {
+               buf->topa_hw[i] = VM_PAGE_TO_PHYS(vm->pages[i]) | topa_size;
+               /*
+                * XXX: TOPA_INT should ideally be set according to
+                * expected amount of incoming trace data. Too few TOPA_INT
+                * entries will not trigger interrupts often enough when tracing
+                * smaller functions.
+                */
+               buf->topa_hw[i] |= TOPA_INT;
+       }
+       buf->topa_hw[vm->npages] = (uint64_t)vtophys(buf->topa_hw) | TOPA_END;
+
+       return (0);
+}
+
/*
 * Configures IP filtering for trace generation.
 * A maximum of 2 ranges can be specified due to
 * limitations imposed by the XSAVE/XRSTOR PT extensions.
 *
 * Returns 0 on success, ENXIO if the CPU lacks IP filtering,
 * or EINVAL for an unsupported range count.
 */
static int
pt_configure_ranges(struct pt_ctx *ctx, struct pt_cpu_config *cfg)
{
	struct pt_ext_area *pt_ext;
	int nranges_supp, n, error = 0;

	pt_ext = pt_ctx_get_ext_area(ctx);
	if (pt_info.l0_ebx & CPUPT_IPF) {
		/* Number of address ranges the CPU supports. */
		nranges_supp = (pt_info.l1_eax & CPUPT_NADDR_M) >>
		    CPUPT_NADDR_S;

		/* The save area only holds PT_IP_FILTER_MAX_RANGES ranges. */
		if (nranges_supp > PT_IP_FILTER_MAX_RANGES)
			nranges_supp = PT_IP_FILTER_MAX_RANGES;
		n = cfg->nranges;
		if (n > nranges_supp) {
			printf("%s: %d IP filtering ranges requested, CPU "
			       "supports %d, truncating\n",
			    __func__, n, nranges_supp);
			n = nranges_supp;
		}

		switch (n) {
		case 2:
			pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(1));
			pt_ext->rtit_addr1_a = cfg->ip_ranges[1].start;
			pt_ext->rtit_addr1_b = cfg->ip_ranges[1].end;
			/* FALLTHROUGH: range 0 is configured as well. */
		case 1:
			pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(0));
			pt_ext->rtit_addr0_a = cfg->ip_ranges[0].start;
			pt_ext->rtit_addr0_b = cfg->ip_ranges[0].end;
			break;
		default:
			error = (EINVAL);
			break;
		};
	} else
		error = (ENXIO);

	return (error);
}
+
+static int
+pt_init_ctx(struct pt_ctx *pt_ctx, struct hwt_vm *vm, int ctx_id)
+{
+
+       dprintf("%s: ctx id %d\n", __func__, ctx_id);
+
+       KASSERT(pt_ctx->buf.topa_hw == NULL,
+           ("%s: active ToPA buffer in context %p\n", __func__, pt_ctx));
+
+       memset(pt_ctx, 0, sizeof(struct pt_ctx));
+       mtx_init(&pt_ctx->buf.lock, "pttopa", NULL, MTX_SPIN);
+       pt_ctx->save_area = malloc_aligned(pt_info.xsave_area_size, 64,
+           M_PT, M_NOWAIT | M_ZERO);
+       if (pt_ctx->save_area == NULL)
+               return (ENOMEM);
+       dprintf("%s: preparing ToPA buffer\n", __func__);
+       if (pt_topa_prepare(pt_ctx, vm) != 0) {
+               dprintf("%s: failed to prepare ToPA buffer\n", __func__);
+               free(pt_ctx->save_area, M_PT);
+               return (ENOMEM);
+       }
+
+       pt_ctx->id = ctx_id;
+       TASK_INIT(&pt_ctx->task, 0, pt_send_buffer_record, pt_ctx);
+
+       return (0);
+}
+
+static void
+pt_deinit_ctx(struct pt_ctx *pt_ctx)
+{
+
+       if (pt_ctx->buf.topa_hw != NULL)
+               free(pt_ctx->buf.topa_hw, M_PT);
+       if (pt_ctx->save_area != NULL)
+               free(pt_ctx->save_area, M_PT);
+       memset(pt_ctx, 0, sizeof(*pt_ctx));
+       pt_ctx->buf.topa_hw = NULL;
+}
+
/*
 * HWT backend configuration method.
 *
 * Checks and translates the user-defined configuration to a
 * set of PT tracing features. Uses the feature set to initialize
 * the tracing context for the target CPU or thread.
 *
 * Returns 0 on success, ENXIO for unsupported features, or ENOENT
 * if no matching CPU/thread context exists.
 */
static int
pt_backend_configure(struct hwt_context *ctx, int cpu_id, int thread_id)
{
	struct hwt_cpu *hwt_cpu;
	struct hwt_thread *thr;
	struct pt_ctx *pt_ctx;
	struct pt_cpu_config *cfg;
	struct pt_ext_area *pt_ext;
	struct xstate_hdr *hdr;
	int error;

	dprintf("%s\n", __func__);

	cfg = (struct pt_cpu_config *)ctx->config;
	pt_ctx = NULL;

	/* Clear any flags we don't support yet. */
	cfg->rtit_ctl &= PT_SUPPORTED_FLAGS;
	/* Validate the requested features against the CPUID capability bits. */
	if (cfg->rtit_ctl & RTIT_CTL_MTCEN) {
		if ((pt_info.l0_ebx & CPUPT_MTC) == 0) {
			printf("%s: CPU does not support generating MTC "
			    "packets\n", __func__);
			return (ENXIO);
		}
	}

	if (cfg->rtit_ctl & RTIT_CTL_CR3FILTER) {
		if ((pt_info.l0_ebx & CPUPT_CR3) == 0) {
			printf("%s: CPU does not support CR3 filtering\n",
			    __func__);
			return (ENXIO);
		}
	}

	if (cfg->rtit_ctl & RTIT_CTL_DIS_TNT) {
		if ((pt_info.l0_ebx & CPUPT_DIS_TNT) == 0) {
			printf("%s: CPU does not support TNT\n", __func__);
			return (ENXIO);
		}
	}
	/* TODO: support for more config bits. */

	/* Look up the tracing context for the target CPU or thread. */
	if (ctx->mode == HWT_MODE_CPU) {
		TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) {
			if (hwt_cpu->cpu_id != cpu_id)
				continue;
			pt_ctx = &pt_pcpu_ctx[cpu_id];
			break;
		}
	} else {
		TAILQ_FOREACH(thr, &ctx->threads, next) {
			if (thr->thread_id != thread_id)
				continue;
			KASSERT(thr->private != NULL,
			    ("%s: hwt thread private"
			     " not set, thr %p",
				__func__, thr));
			pt_ctx = (struct pt_ctx *)thr->private;
			break;
		}
	}
	if (pt_ctx == NULL)
		return (ENOENT);

	/*
	 * Stage the PT register state in the context's XSAVE area; it is
	 * loaded into the hardware by XRSTORS when tracing starts.
	 */
	dprintf("%s: preparing MSRs\n", __func__);
	pt_ext = pt_ctx_get_ext_area(pt_ctx);
	hdr = pt_ctx_get_xstate_hdr(pt_ctx);

	pt_ext->rtit_ctl |= cfg->rtit_ctl;
	if (cfg->nranges != 0) {
		dprintf("%s: preparing IPF ranges\n", __func__);
		if ((error = pt_configure_ranges(pt_ctx, cfg)) != 0)
			return (error);
	}
	pt_ctx->hwt_ctx = ctx;
	/* Output goes through the ToPA table built by pt_topa_prepare(). */
	pt_ext->rtit_ctl |= RTIT_CTL_TOPA;
	pt_ext->rtit_output_base = (uint64_t)vtophys(pt_ctx->buf.topa_hw);
	pt_ext->rtit_output_mask_ptrs = PT_TOPA_MASK_PTRS;
	/* Mark only the PT component as present in the compacted save area. */
	hdr->xstate_bv = XFEATURE_ENABLED_PT;
	hdr->xstate_xcomp_bv = XFEATURE_ENABLED_PT |
	    XSTATE_XCOMP_BV_COMPACT;
	pt_ext->rtit_ctl |= RTIT_CTL_TRACEEN;
	pt_pcpu[cpu_id].ctx = pt_ctx;
	pt_cpu_set_state(cpu_id, PT_STOPPED);

	return (0);
}
+
+/*
+ * hwt backend trace start operation. CPU affine.
+ */
+static void
+pt_backend_enable(struct hwt_context *ctx, int cpu_id)
+{
+       if (ctx->mode == HWT_MODE_CPU)
+               return;
+
+       KASSERT(curcpu == cpu_id,
+           ("%s: attempting to start PT on another cpu", __func__));
+       pt_cpu_start(NULL);
+       CPU_SET(cpu_id, &ctx->cpu_map);
+}
+
+/*
+ * hwt backend trace stop operation. CPU affine.
+ */
+static void
+pt_backend_disable(struct hwt_context *ctx, int cpu_id)
+{
+       struct pt_cpu *cpu;
+
+       if (ctx->mode == HWT_MODE_CPU)
+               return;
+
+       KASSERT(curcpu == cpu_id,
+           ("%s: attempting to disable PT on another cpu", __func__));
+       pt_cpu_stop(NULL);
+       CPU_CLR(cpu_id, &ctx->cpu_map);
+       cpu = &pt_pcpu[cpu_id];
+       cpu->ctx = NULL;
+}
+
/*
 * hwt backend trace start operation for remote CPUs.
 * Returns 0 on success, -1 if a CPU-mode session is already running.
 */
static int
pt_backend_enable_smp(struct hwt_context *ctx)
{

	dprintf("%s\n", __func__);
	/* Only one CPU-mode tracing session may be active at a time. */
	if (ctx->mode == HWT_MODE_CPU &&
	    atomic_swap_32(&cpu_mode_ctr, 1) != 0)
		return (-1);

	KASSERT(ctx->mode == HWT_MODE_CPU,
	    ("%s: should only be used for CPU mode", __func__));
	/* Start tracing on every CPU in the session's map. */
	smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_start, NULL, NULL);

	return (0);
}
+
/*
 * hwt backend trace stop operation for remote CPUs.
 * Returns 0 on success, -1 if no CPU-mode session was running or
 * the CPU map is empty.
 */
static int
pt_backend_disable_smp(struct hwt_context *ctx)
{

	dprintf("%s\n", __func__);
	/* Release the single CPU-mode session slot if we held it. */
	if (ctx->mode == HWT_MODE_CPU &&
	    atomic_swap_32(&cpu_mode_ctr, 0) == 0)
		return (-1);

	if (CPU_EMPTY(&ctx->cpu_map)) {
		dprintf("%s: empty cpu map\n", __func__);
		return (-1);
	}
	smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_stop, NULL, NULL);

	return (0);
}
+
+/*
+ * HWT backend initialization method.
+ *
+ * Installs the ToPA interrupt handler and initializes
+ * the tracing contexts used for HWT_MODE_CPU.
+ */
+static int
+pt_backend_init(struct hwt_context *ctx)
+{
+       struct hwt_cpu *hwt_cpu;
+       int error;
+
+       dprintf("%s\n", __func__);
+       if (ctx->mode == HWT_MODE_CPU) {
+               TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) {
+                       error = pt_init_ctx(&pt_pcpu_ctx[hwt_cpu->cpu_id],
+                           hwt_cpu->vm, hwt_cpu->cpu_id);
+                       if (error)
+                               return (error);
+               }
+       }
+
+       return (0);
+}
+
/*
 * HWT backend teardown method.
 *
 * Removes the ToPA interrupt handler, stops tracing on all active CPUs,
 * and releases all previously allocated ToPA metadata.
 */
static int
pt_backend_deinit(struct hwt_context *ctx)
{
	struct pt_ctx *pt_ctx;
	struct hwt_thread *thr;
	int cpu_id;

	dprintf("%s\n", __func__);

	/* Ensure tracing is stopped everywhere before freeing contexts. */
	pt_backend_disable_smp(ctx);
	if (ctx->mode == HWT_MODE_THREAD) {
		TAILQ_FOREACH(thr, &ctx->threads, next) {
			KASSERT(thr->private != NULL,
			    ("%s: thr->private not set", __func__));
			pt_ctx = (struct pt_ctx *)thr->private;
			pt_deinit_ctx(pt_ctx);
		}
	} else {
		CPU_FOREACH(cpu_id) {
			if (!CPU_ISSET(cpu_id, &ctx->cpu_map))
				continue;
			/* Detach the active context before tearing it down. */
			if (pt_pcpu[cpu_id].ctx != NULL) {
				KASSERT(pt_pcpu[cpu_id].ctx ==
					&pt_pcpu_ctx[cpu_id],
				    ("%s: CPU mode tracing with non-cpu mode PT"
				     "context active",
					__func__));
				pt_pcpu[cpu_id].ctx = NULL;
			}
			pt_ctx = &pt_pcpu_ctx[cpu_id];
			pt_deinit_ctx(pt_ctx);
			memset(&pt_pcpu[cpu_id], 0, sizeof(struct pt_cpu));
		}
	}

	return (0);
}
+
/*
 * Fetches current offset into the tracing buffer.
 * The reported offset is cumulative (wraps folded in) so userspace
 * can detect how much data was produced in total.
 */
static int
pt_backend_read(struct hwt_vm *vm, int *curpage, vm_offset_t *curpage_offset,
    uint64_t *data)
{
	struct pt_buffer *buf;

	if (vm->ctx->mode == HWT_MODE_THREAD)
		buf = &((struct pt_ctx *)vm->thr->private)->buf;
	else
		buf = &pt_pcpu[vm->cpu->cpu_id].ctx->buf;
	mtx_lock_spin(&buf->lock);
	*curpage = buf->curpage;
	/*
	 * NOTE(review): this uses vm->ctx->bufsize while
	 * pt_fill_buffer_record() uses buf->size — presumably equal;
	 * confirm against hwt(4).
	 */
	*curpage_offset = buf->offset + (buf->wrap_count * vm->ctx->bufsize);
	mtx_unlock_spin(&buf->lock);

	return (0);
}
+
+/*
+ * HWT thread creation hook.
+ * Allocates and associates a 'struct pt_ctx' for a given hwt thread.
+ */
+static int
+pt_backend_alloc_thread(struct hwt_thread *thr)
+{
+       struct pt_ctx *pt_ctx;
+       int error;
+
+       /* Omit M_WAITOK since this might get invoked a non-sleepable context */
+       pt_ctx = malloc(sizeof(*pt_ctx), M_PT, M_NOWAIT | M_ZERO);
+       if (pt_ctx == NULL)
+               return (ENOMEM);
+
+       error = pt_init_ctx(pt_ctx, thr->vm, thr->thread_id);
+       if (error)
+               return (error);
+
+       thr->private = pt_ctx;
+       return (0);
+}
+/*
+ * HWT thread teardown hook.
+ */
+static void
+pt_backend_free_thread(struct hwt_thread *thr)
+{
+       struct pt_ctx *ctx;
+
+       ctx = (struct pt_ctx *)thr->private;
+
+       pt_deinit_ctx(ctx);
+       free(ctx, M_PT);
+}
+
/* HWT dump hook; intentionally a no-op for the PT backend. */
static void
pt_backend_dump(int cpu_id)
{
}
+
/* hwt(4) backend operations vector; see dev/hwt/hwt_backend.h. */
static struct hwt_backend_ops pt_ops = {
	.hwt_backend_init = pt_backend_init,
	.hwt_backend_deinit = pt_backend_deinit,

	.hwt_backend_configure = pt_backend_configure,

	.hwt_backend_enable = pt_backend_enable,
	.hwt_backend_disable = pt_backend_disable,

#ifdef SMP
	.hwt_backend_enable_smp = pt_backend_enable_smp,
	.hwt_backend_disable_smp = pt_backend_disable_smp,
#endif

	.hwt_backend_read = pt_backend_read,
	.hwt_backend_dump = pt_backend_dump,

	.hwt_backend_thread_alloc = pt_backend_alloc_thread,
	.hwt_backend_thread_free = pt_backend_free_thread,
};
+
static struct hwt_backend backend = {
	.ops = &pt_ops,
	.name = "pt",
	.kva_req = 1,	/* presumably: buffers need a KVA mapping — confirm */
};
+
/*
 * Reads the latest valid trace buffer offset and enqueues
 * a HWT_RECORD_BUFFER record.
 * Used as a taskqueue routine from the ToPA interrupt handler.
 */
static void
pt_send_buffer_record(void *arg, int pending __unused)
{
	struct hwt_record_entry record;
	struct pt_ctx *ctx = (struct pt_ctx *)arg;

	/* Prepare buffer record. */
	mtx_lock_spin(&ctx->buf.lock);
	pt_fill_buffer_record(ctx->id, &ctx->buf, &record);
	mtx_unlock_spin(&ctx->buf.lock);
	/* M_NOWAIT: taskqueue context should not sleep on allocation. */
	hwt_record_ctx(ctx->hwt_ctx, &record, M_ZERO | M_NOWAIT);
}
+static void
+pt_topa_status_clear(void)
+{
+       uint64_t reg;
+
+       reg = rdmsr(MSR_IA_GLOBAL_STATUS_RESET);
+       reg &= ~GLOBAL_STATUS_FLAG_TRACETOPAPMI;
+       reg |= GLOBAL_STATUS_FLAG_TRACETOPAPMI;
+       wrmsr(MSR_IA_GLOBAL_STATUS_RESET, reg);
+}
+
/*
 * ToPA PMI handler.
 *
 * Invoked every time a ToPA entry marked with TOPA_INT is filled.
 * Uses taskqueue to enqueue a buffer record for userspace.
 * Re-enables the PC interrupt line as long as tracing is active.
 *
 * Returns 1 if the NMI was handled here, 0 otherwise.
 */
static int
pt_topa_intr(struct trapframe *tf)
{
	struct pt_buffer *buf;
	struct pt_ctx *ctx;
	uint64_t reg;

	SDT_PROBE0(pt, , , topa__intr);

	/* Not ours unless this CPU is actively tracing. */
	if (pt_cpu_get_state(curcpu) != PT_ACTIVE) {
		return (0);
	}
	reg = rdmsr(MSR_IA_GLOBAL_STATUS);
	if ((reg & GLOBAL_STATUS_FLAG_TRACETOPAPMI) == 0) {
		/* ACK spurious or leftover interrupt. */
		pt_topa_status_clear();
		return (1);
	}

	ctx = pt_pcpu[curcpu].ctx;
	buf = &ctx->buf;
	KASSERT(buf->topa_hw != NULL,
	    ("%s: ToPA PMI interrupt with invalid buffer", __func__));

	/* Pause tracing while the buffer state is sampled. */
	pt_cpu_toggle_local(ctx->save_area, false);
	pt_update_buffer(buf);
	pt_topa_status_clear();
	/* Defer record delivery to a taskqueue; NMI context cannot sleep. */
	taskqueue_enqueue_flags(taskqueue_pt, &ctx->task,
	    TASKQUEUE_FAIL_IF_PENDING);

	/* Resume, unless tracing was stopped while this PMI was handled. */
	if (pt_cpu_get_state(curcpu) == PT_ACTIVE) {
		pt_cpu_toggle_local(ctx->save_area, true);
		lapic_reenable_pcint();
	}
	return (1);
}
+
+/*
+ * Module initialization.
+ *
+ * Saves all PT-related cpuid info, registers itself as a HWT backend,
+ * and allocates metadata required to keep track of tracing operations
+ * on each CPU.
+ */
+static int
+pt_init(void)
+{
+       u_int cp[4];
+       int error;
+
+       dprintf("pt: Enumerating part 1\n");
+       cpuid_count(CPUID_PT_LEAF, 0, cp);
+       dprintf("pt: Maximum valid sub-leaf Index: %x\n", cp[0]);
+       dprintf("pt: ebx %x\n", cp[1]);
+       dprintf("pt: ecx %x\n", cp[2]);
+
+       pt_info.l0_eax = cp[0];
+       pt_info.l0_ebx = cp[1];
+       pt_info.l0_ecx = cp[2];
+
+       dprintf("pt: Enumerating part 2\n");
+       cpuid_count(CPUID_PT_LEAF, 1, cp);
+       dprintf("pt: eax %x\n", cp[0]);
+       dprintf("pt: ebx %x\n", cp[1]);
+
+       pt_info.l1_eax = cp[0];
+       pt_info.l1_ebx = cp[1];
+
+       error = hwt_backend_register(&backend);
+       if (error != 0) {
+               printf("pt: unable to register hwt backend, error %d\n", error);
+               return (error);
+       }
+       pt_pcpu = mallocarray(mp_ncpus, sizeof(struct pt_cpu), M_PT,
+           M_ZERO | M_WAITOK);
+       pt_pcpu_ctx = mallocarray(mp_ncpus, sizeof(struct pt_ctx), M_PT,
+           M_ZERO | M_WAITOK);
+
+       nmi_register_handler(pt_topa_intr);
+       if (!lapic_enable_pcint()) {
+               nmi_remove_handler(pt_topa_intr);
+               hwt_backend_unregister(&backend);
+               free(pt_pcpu, M_PT);
+               free(pt_pcpu_ctx, M_PT);
+               pt_pcpu = NULL;
+               pt_pcpu_ctx = NULL;
+               printf("pt: failed to setup interrupt line\n");
+               return (error);
+       }
+       initialized = true;
+
+       return (0);
+}
+
/*
 * Checks whether the CPU support Intel PT and
 * initializes XSAVE area info.
 *
 * The driver relies on XSAVE/XRSTOR PT extensions,
 * Table of Physical Addresses (ToPA) support, and
 * support for multiple ToPA entries.
 */
static bool
pt_supported(void)
{
	u_int cp[4];

	if ((cpu_stdext_feature & CPUID_STDEXT_PROCTRACE) == 0) {
		printf("pt: CPU does not support Intel Processor Trace\n");
		return (false);
	}
	if ((cpu_feature2 & CPUID2_XSAVE) == 0) {
		printf("pt: XSAVE is not supported\n");
		return (false);
	}
	if (!xsave_extfeature_supported(XFEATURE_ENABLED_PT, true)) {
		printf("pt: CPU does not support managing PT state using XSAVE\n");
		return (false);
	}
	if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVEC)) {
		printf("pt: XSAVE compaction is not supported\n");
		return (false);
	}
	if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVES)) {
		printf("pt: CPU does not support XSAVES/XRSTORS\n");
		return (false);
	}

	/* Require ToPA support. */
	cpuid_count(CPUID_PT_LEAF, 0, cp);
	if ((cp[2] & CPUPT_TOPA) == 0) {
		printf("pt: ToPA is not supported\n");
		return (false);
	}
	if ((cp[2] & CPUPT_TOPA_MULTI) == 0) {
		printf("pt: multiple ToPA outputs are not supported\n");
		return (false);
	}

	/* Cache the XSAVE layout used to locate the PT state component. */
	pt_info.xstate_hdr_offset = xsave_area_hdr_offset();
	pt_info.xsave_area_size = xsave_area_size(PT_XSTATE_BV, true, true);
	pt_info.pt_xsave_offset = xsave_area_offset(PT_XSTATE_BV,
	    XFEATURE_ENABLED_PT, true, true);

	return (true);
}
+
+static void
+pt_deinit(void)
+{
+       if (!initialized)
+               return;
+       nmi_remove_handler(pt_topa_intr);
+       lapic_disable_pcint();
+       hwt_backend_unregister(&backend);
+       free(pt_pcpu, M_PT);
+       free(pt_pcpu_ctx, M_PT);
+       pt_pcpu = NULL;
+       initialized = false;
+}
+
+static int
+pt_modevent(module_t mod, int type, void *data)
+{
+       switch (type) {
+       case MOD_LOAD:
+               if (!pt_supported() || pt_init() != 0) {
*** 107 LINES SKIPPED ***

Reply via email to