This patch provides an Intel PT logging feature. When the system boots with the parameter "intel_pt_log", log buffers for Intel PT are allocated and logging starts; processor flow information is then written to the log buffer by hardware, like a flight recorder. This is very helpful for investigating the cause of a kernel panic.
The log buffer size is specified by the parameter "intel_pt_log_buf_len=<size>". This buffer is used as a circular buffer; therefore, old events are overwritten by new events. Signed-off-by: Takao Indoh <indou.ta...@jp.fujitsu.com> --- arch/x86/Kconfig | 16 +++ arch/x86/include/asm/intel_pt_log.h | 13 ++ arch/x86/kernel/cpu/Makefile | 2 + arch/x86/kernel/cpu/intel_pt_log.c | 178 +++++++++++++++++++++++++++++ arch/x86/kernel/cpu/perf_event_intel_pt.c | 6 + 5 files changed, 215 insertions(+), 0 deletions(-) create mode 100644 arch/x86/include/asm/intel_pt_log.h create mode 100644 arch/x86/kernel/cpu/intel_pt_log.c diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f37010f..2b99ba2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1722,6 +1722,22 @@ config X86_INTEL_MPX If unsure, say N. +config X86_INTEL_PT_LOG + prompt "Intel PT logger" + def_bool n + depends on PERF_EVENTS && CPU_SUP_INTEL + ---help--- + Intel PT is a hardware feature that can capture information + about program execution flow. Once Intel PT is enabled, the + events which change program flow, like branch instructions, + exceptions, interrupts, traps and so on are logged in + the memory. + + This option enables starting the Intel PT logging feature at boot + time. When a kernel panic occurs, the Intel PT log buffer can be + retrieved from the crash dump file and used to reconstruct the + detailed flow that led to the panic. 
+ config EFI bool "EFI runtime service support" depends on ACPI diff --git a/arch/x86/include/asm/intel_pt_log.h b/arch/x86/include/asm/intel_pt_log.h new file mode 100644 index 0000000..cef63f7 --- /dev/null +++ b/arch/x86/include/asm/intel_pt_log.h @@ -0,0 +1,13 @@ +#ifndef __INTEL_PT_LOG_H__ +#define __INTEL_PT_LOG_H__ + +#if defined(CONFIG_X86_INTEL_PT_LOG) + +#include <linux/perf_event.h> + +void pt_log_start(struct pmu *pmu); +void save_intel_pt_registers(void); + +#endif + +#endif /* __INTEL_PT_LOG_H__ */ diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 4eb065c..67c17f0 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -48,6 +48,8 @@ obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \ perf_event_intel_uncore_nhmex.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_msr.o obj-$(CONFIG_CPU_SUP_AMD) += perf_event_msr.o + +obj-$(CONFIG_X86_INTEL_PT_LOG) += intel_pt_log.o endif diff --git a/arch/x86/kernel/cpu/intel_pt_log.c b/arch/x86/kernel/cpu/intel_pt_log.c new file mode 100644 index 0000000..eb345fd --- /dev/null +++ b/arch/x86/kernel/cpu/intel_pt_log.c @@ -0,0 +1,178 @@ +/* + * Intel Processor Trace Logger + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/trace_events.h> +#include <asm/intel_pt_log.h> + +#define SAMPLE_TYPE_BASE \ + (PERF_SAMPLE_IP|PERF_SAMPLE_TID|PERF_SAMPLE_TIME|PERF_SAMPLE_IDENTIFIER) +#define SAMPLE_TYPE_PT \ + (SAMPLE_TYPE_BASE|PERF_SAMPLE_CPU|PERF_SAMPLE_RAW) +#define SAMPLE_TYPE_SCHED \ + (SAMPLE_TYPE_BASE|PERF_SAMPLE_CPU|PERF_SAMPLE_PERIOD|PERF_SAMPLE_RAW) +#define SAMPLE_TYPE_DUMMY \ + (SAMPLE_TYPE_BASE) + +/* intel_pt */ +static struct perf_event_attr pt_attr_pt = { + .config = 0x400, /* bit10: TSCEn */ + .size = sizeof(struct perf_event_attr), + .sample_type = SAMPLE_TYPE_PT, + .read_format = PERF_FORMAT_ID, + .inherit = 1, + .pinned = 1, + .sample_id_all = 1, + .exclude_guest = 1 +}; + +/* sched:sched_switch */ +static struct 
perf_event_attr pt_attr_sched = { + .type = PERF_TYPE_TRACEPOINT, + .size = sizeof(struct perf_event_attr), + .sample_type = SAMPLE_TYPE_SCHED, + .read_format = PERF_FORMAT_ID, + .inherit = 1, + .sample_id_all = 1, + .exclude_guest = 1 +}; + +/* dummy:u */ +static struct perf_event_attr pt_attr_dummy = { + .type = PERF_TYPE_SOFTWARE, + .config = PERF_COUNT_SW_DUMMY, + .size = sizeof(struct perf_event_attr), + .sample_type = SAMPLE_TYPE_DUMMY, + .read_format = PERF_FORMAT_ID, + .inherit = 1, + .exclude_kernel = 1, + .exclude_hv = 1, + .comm = 1, + .task = 1, + .sample_id_all = 1, + .comm_exec = 1 +}; + +static int pt_log_enabled; +static int pt_log_buf_nr_pages = 128; /* number of pages for log buffer */ +static struct cpumask pt_log_cpu_mask; + +static DEFINE_PER_CPU(struct perf_event *, pt_perf_event_pt); +static DEFINE_PER_CPU(struct perf_event *, pt_perf_event_sched); +static DEFINE_PER_CPU(struct perf_event *, pt_perf_event_dummy); + +/* Saved registers on panic */ +static DEFINE_PER_CPU(u64, saved_msr_ctl); +static DEFINE_PER_CPU(u64, saved_msr_status); +static DEFINE_PER_CPU(u64, saved_msr_output_base); +static DEFINE_PER_CPU(u64, saved_msr_output_mask); + +void save_intel_pt_registers(void) +{ + int cpu = smp_processor_id(); + u64 ctl; + + if (!cpumask_test_cpu(cpu, &pt_log_cpu_mask)) + return; + + /* Save RTIT_CTL register */ + rdmsrl(MSR_IA32_RTIT_CTL, ctl); + per_cpu(saved_msr_ctl, cpu) = ctl; + + /* Stop tracing */ + ctl &= ~RTIT_CTL_TRACEEN; + wrmsrl(MSR_IA32_RTIT_CTL, ctl); + + /* Save other registers */ + rdmsrl(MSR_IA32_RTIT_STATUS, per_cpu(saved_msr_status, cpu)); + rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, per_cpu(saved_msr_output_base, cpu)); + rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, per_cpu(saved_msr_output_mask, cpu)); +} + +static int pt_enable_kernel_counter(int cpu) +{ + struct perf_event *event = NULL; + + /* Create counter for intel_pt */ + event = perf_event_create_kernel_counter_with_buffer(&pt_attr_pt, + cpu, NULL, NULL, NULL, 0, + 
pt_log_buf_nr_pages, pt_log_buf_nr_pages, event); + + if (IS_ERR(event)) { + pr_err("failed to create counter for pt: cpu=%d, err=%d\n", + cpu, IS_ERR(event)); + return -1; + } + per_cpu(pt_perf_event_pt, cpu) = event; + + /* Create counter for side-band data (sched:sched_switch) */ + event = perf_event_create_kernel_counter_with_buffer(&pt_attr_sched, + cpu, NULL, NULL, NULL, 0, 0, 0, event); + + if (IS_ERR(event)) + pr_warn("failed to create counter for sched: cpu=%d, err=%d\n", + cpu, IS_ERR(event)); + else + per_cpu(pt_perf_event_sched, cpu) = event; + + /* Create counter for side-band data (dummy:u) */ + event = perf_event_create_kernel_counter_with_buffer(&pt_attr_dummy, + cpu, NULL, NULL, NULL, 0, 0, 0, event); + + if (IS_ERR(event)) + pr_warn("failed to create counter for dummy: cpu=%d, err=%d\n", + cpu, IS_ERR(event)); + else + per_cpu(pt_perf_event_dummy, cpu) = event; + + return 0; +} + +static __init int pt_log_buf_setup(char *str) +{ + int len; + + if (get_option(&str, &len)) + pt_log_buf_nr_pages = len>>PAGE_SHIFT; + + return 1; +} +__setup("intel_pt_log_buf_len", pt_log_buf_setup); + +static __init int pt_log_setup(char *str) +{ + pt_log_enabled = 1; + return 1; +} +__setup("intel_pt_log", pt_log_setup); + +__init void pt_log_start(struct pmu *pmu) +{ + int cpu, type; + + cpumask_clear(&pt_log_cpu_mask); + + if (!pt_log_enabled) + return; + + type = perf_trace_event_get_type_by_name("sched", "sched_switch"); + if (!type) { + pr_err("Cannot find sched:sched_switch event\n"); + return; + } + + pt_attr_sched.config = type; + pt_attr_sched.sample_period = 1; + pt_attr_pt.type = pmu->type; + + get_online_cpus(); + for_each_online_cpu(cpu) { + if (!pt_enable_kernel_counter(cpu)) + cpumask_set_cpu(cpu, &pt_log_cpu_mask); + } + put_online_cpus(); +} + diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c index 4216928..5154670 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_pt.c +++ 
b/arch/x86/kernel/cpu/perf_event_intel_pt.c @@ -27,6 +27,7 @@ #include <asm/perf_event.h> #include <asm/insn.h> #include <asm/io.h> +#include <asm/intel_pt_log.h> #include "perf_event.h" #include "intel_pt.h" @@ -1173,6 +1174,11 @@ static __init int pt_init(void) pt_pmu.pmu.free_aux = pt_buffer_free_aux; ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1); +#ifdef CONFIG_X86_INTEL_PT_LOG + if (!ret) + pt_log_start(&pt_pmu.pmu); +#endif + return ret; } arch_initcall(pt_init); -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/