Changes from v2:
 - Print entry->ip instead of entry->regs->ip to avoid kernel crash.
 - Use %pf instead of 0x%lx to print address and ip.

This patch introduces page fault tracepoints to the x86 architecture
by switching the IDT.

[Use case of page fault events]

  Two events, for user and kernel spaces, are introduced at the beginning of
  the page fault handler.

  - User space event
    A page fault event for user space has been requested, as shown below.

    http://marc.info/?l=linux-mm&m=136807959830182&w=2
    http://marc.info/?l=linux-mm&m=136807959130175&w=2

  - Kernel space event:
    Enabling this event makes page fault overhead in kernel space measurable.

[Creating IDT]

 The IDT is set up as follows.

 - Introduce set_intr_gate_raw() to register only the non-trace handler in
   the IDT. This is used at boot time, when tracing is disabled.
 - Make set_intr_gate() a macro so that it can register the trace handler in
   the trace IDT and the non-trace handler in the normal IDT.

Signed-off-by: Seiji Aguchi <seiji.agu...@hds.com>
---
 arch/x86/include/asm/desc.h             | 33 +++++++++++++++++----
 arch/x86/include/asm/hw_irq.h           | 14 ++++++++-
 arch/x86/include/asm/trace/exceptions.h | 52 +++++++++++++++++++++++++++++++++
 arch/x86/include/asm/traps.h            | 22 ++++++++++++++
 arch/x86/kernel/entry_32.S              | 10 +++++++
 arch/x86/kernel/entry_64.S              | 13 ++++++++-
 arch/x86/kernel/head64.c                |  2 +-
 arch/x86/kernel/irqinit.c               |  2 +-
 arch/x86/kernel/kvm.c                   |  2 +-
 arch/x86/kernel/traps.c                 | 28 +++++++++---------
 arch/x86/mm/Makefile                    |  2 ++
 arch/x86/mm/fault.c                     | 22 ++++++++++++++
 12 files changed, 178 insertions(+), 24 deletions(-)
 create mode 100644 arch/x86/include/asm/trace/exceptions.h

diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index b90e5df..c04302b 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -327,10 +327,28 @@ static inline void write_trace_idt_entry(int entry, const gate_desc *gate)
 {
        write_idt_entry(trace_idt_table, entry, gate);
 }
+
+static inline void _trace_set_gate(int gate, unsigned type, void *addr,
+                                  unsigned dpl, unsigned ist, unsigned seg)
+{
+       gate_desc s;
+
+       pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
+       /*
+        * does not need to be atomic because it is only done once at
+        * setup time
+        */
+       write_trace_idt_entry(gate, &s);
+}
 #else
 static inline void write_trace_idt_entry(int entry, const gate_desc *gate)
 {
 }
+
+static inline void _trace_set_gate(int gate, unsigned type, void *addr,
+                                  unsigned dpl, unsigned ist, unsigned seg)
+{
+}
 #endif
 
 static inline void _set_gate(int gate, unsigned type, void *addr,
@@ -353,12 +371,20 @@ static inline void _set_gate(int gate, unsigned type, void *addr,
  * Pentium F0 0F bugfix can have resulted in the mapped
  * IDT being write-protected.
  */
-static inline void set_intr_gate(unsigned int n, void *addr)
+static inline void set_intr_gate_raw(unsigned int n, void *addr)
 {
        BUG_ON((unsigned)n > 0xFF);
        _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
 }
 
+#define set_intr_gate(n, addr)                                         \
+       do {                                                            \
+               BUG_ON((unsigned)n > 0xFF);                             \
+               _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);  \
+               _trace_set_gate(n, GATE_INTERRUPT, trace_##addr, 0, 0,  \
+                               __KERNEL_CS);                           \
+       } while (0)
+
 extern int first_system_vector;
 /* used_vectors is BITMAP for irq is not managed by percpu vector_irq */
 extern unsigned long used_vectors[];
@@ -395,10 +421,7 @@ static inline void trace_set_intr_gate(unsigned int gate, void *addr)
 #define __trace_alloc_intr_gate(n, addr)
 #endif
 
-static inline void __alloc_intr_gate(unsigned int n, void *addr)
-{
-       set_intr_gate(n, addr);
-}
+#define __alloc_intr_gate(n, addr) set_intr_gate(n, addr)
 
 #define alloc_intr_gate(n, addr)                               \
        do {                                                    \
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 92b3bae..c856e69 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -89,10 +89,22 @@ extern void trace_reschedule_interrupt(void);
 extern void trace_threshold_interrupt(void);
 extern void trace_call_function_interrupt(void);
 extern void trace_call_function_single_interrupt(void);
+#else /* CONFIG_TRACING */
+#define trace_apic_timer_interrupt apic_timer_interrupt
+#define trace_x86_platform_ipi x86_platform_ipi
+#define trace_error_interrupt error_interrupt
+#define trace_irq_work_interrupt irq_work_interrupt
+#define trace_spurious_interrupt spurious_interrupt
+#define trace_thermal_interrupt thermal_interrupt
+#define trace_reschedule_interrupt reschedule_interrupt
+#define trace_threshold_interrupt threshold_interrupt
+#define trace_call_function_interrupt call_function_interrupt
+#define trace_call_function_single_interrupt call_function_single_interrupt
+#endif
+
 #define trace_irq_move_cleanup_interrupt  irq_move_cleanup_interrupt
 #define trace_reboot_interrupt  reboot_interrupt
 #define trace_kvm_posted_intr_ipi kvm_posted_intr_ipi
-#endif /* CONFIG_TRACING */
 
 /* IOAPIC */
 #define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1<<(x)) & io_apic_irqs))
diff --git a/arch/x86/include/asm/trace/exceptions.h b/arch/x86/include/asm/trace/exceptions.h
new file mode 100644
index 0000000..86540c0
--- /dev/null
+++ b/arch/x86/include/asm/trace/exceptions.h
@@ -0,0 +1,52 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM exceptions
+
+#if !defined(_TRACE_PAGE_FAULT_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_PAGE_FAULT_H
+
+#include <linux/tracepoint.h>
+
+extern void trace_irq_vector_regfunc(void);
+extern void trace_irq_vector_unregfunc(void);
+
+DECLARE_EVENT_CLASS(x86_exceptions,
+
+       TP_PROTO(unsigned long address, struct pt_regs *regs,
+                unsigned long error_code),
+
+       TP_ARGS(address, regs, error_code),
+
+       TP_STRUCT__entry(
+               __field(                unsigned long, address  )
+               __field(                unsigned long, ip       )
+               __field(                unsigned long, error_code )
+       ),
+
+       TP_fast_assign(
+               __entry->address = address;
+               __entry->ip = regs->ip;
+               __entry->error_code = error_code;
+       ),
+
+       TP_printk("address=%pf ip=%pf error_code=0x%lx",
+                 (void *)__entry->address, (void *)__entry->ip,
+                 __entry->error_code) );
+
+#define DEFINE_PAGE_FAULT_EVENT(name)                          \
+DEFINE_EVENT_FN(x86_exceptions, name,                          \
+       TP_PROTO(unsigned long address, struct pt_regs *regs,   \
+                unsigned long error_code),                     \
+       TP_ARGS(address, regs, error_code),                     \
+       trace_irq_vector_regfunc,                               \
+       trace_irq_vector_unregfunc);
+
+DEFINE_PAGE_FAULT_EVENT(user_page_fault);
+DEFINE_PAGE_FAULT_EVENT(kernel_page_fault);
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE exceptions
+#endif /*  _TRACE_PAGE_FAULT_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 7036cb6..a400a22 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -37,6 +37,25 @@ asmlinkage void machine_check(void);
 #endif /* CONFIG_X86_MCE */
 asmlinkage void simd_coprocessor_error(void);
 
+#ifdef CONFIG_TRACING
+asmlinkage void trace_page_fault(void);
+#else
+#define trace_page_fault page_fault
+#endif
+#define trace_divide_error divide_error
+#define trace_bounds bounds
+#define trace_invalid_op invalid_op
+#define trace_device_not_available device_not_available
+#define trace_coprocessor_segment_overrun coprocessor_segment_overrun
+#define trace_invalid_TSS invalid_TSS
+#define trace_segment_not_present segment_not_present
+#define trace_general_protection general_protection
+#define trace_spurious_interrupt_bug spurious_interrupt_bug
+#define trace_coprocessor_error coprocessor_error
+#define trace_alignment_check alignment_check
+#define trace_simd_coprocessor_error simd_coprocessor_error
+#define trace_async_page_fault async_page_fault
+
 dotraplinkage void do_divide_error(struct pt_regs *, long);
 dotraplinkage void do_debug(struct pt_regs *, long);
 dotraplinkage void do_nmi(struct pt_regs *, long);
@@ -55,6 +74,9 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *);
 #endif
 dotraplinkage void do_general_protection(struct pt_regs *, long);
 dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);
+#ifdef CONFIG_TRACING
+dotraplinkage void trace_do_page_fault(struct pt_regs *, unsigned long);
+#endif
 dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long);
 dotraplinkage void do_coprocessor_error(struct pt_regs *, long);
 dotraplinkage void do_alignment_check(struct pt_regs *, long);
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 2cfbc3a..c9eb4e2 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1244,6 +1244,16 @@ return_to_handler:
  */
        .pushsection .kprobes.text, "ax"
 
+#ifdef CONFIG_TRACING
+ENTRY(trace_page_fault)
+       RING0_EC_FRAME
+       ASM_CLAC
+       pushl_cfi $trace_do_page_fault
+       jmp error_code
+       CFI_ENDPROC
+END(trace_page_fault)
+#endif
+
 ENTRY(page_fault)
        RING0_EC_FRAME
        ASM_CLAC
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 1b69951..5136404 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1295,6 +1295,17 @@ ENTRY(\sym)
 END(\sym)
 .endm
 
+#ifdef CONFIG_TRACING
+.macro trace_errorentry sym do_sym
+errorentry trace(\sym) trace(\do_sym)
+errorentry \sym \do_sym
+.endm
+#else
+.macro trace_errorentry sym do_sym
+errorentry \sym \do_sym
+.endm
+#endif
+
        /* error code is on the stack already */
 .macro paranoiderrorentry sym do_sym
 ENTRY(\sym)
@@ -1497,7 +1508,7 @@ zeroentry xen_int3 do_int3
 errorentry xen_stack_segment do_stack_segment
 #endif
 errorentry general_protection do_general_protection
-errorentry page_fault do_page_fault
+trace_errorentry page_fault do_page_fault
 #ifdef CONFIG_KVM_GUEST
 errorentry async_page_fault do_async_page_fault
 #endif
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 1be8e43..aebb2bf 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -162,7 +162,7 @@ asmlinkage void __init x86_64_start_kernel(char * real_mode_data)
        clear_bss();
 
        for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
-               set_intr_gate(i, &early_idt_handlers[i]);
+               set_intr_gate_raw(i, &early_idt_handlers[i]);
        load_idt((const struct desc_ptr *)&idt_descr);
 
        copy_bootdata(__va(real_mode_data));
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index a2a1fbc..2ca2354 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -206,7 +206,7 @@ void __init native_init_IRQ(void)
        i = FIRST_EXTERNAL_VECTOR;
        for_each_clear_bit_from(i, used_vectors, NR_VECTORS) {
                /* IA32_SYSCALL_VECTOR could be used in trap_init already. */
-               set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
+               set_intr_gate_raw(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
        }
 
        if (!acpi_ioapic && !of_ioapic)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 697b93a..ba202ee 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -464,7 +464,7 @@ static struct notifier_block kvm_cpu_notifier = {
 
 static void __init kvm_apf_trap_init(void)
 {
-       set_intr_gate(14, &async_page_fault);
+       set_intr_gate(14, async_page_fault);
 }
 
 void __init kvm_guest_init(void)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 8c8093b..1c9d0ad 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -713,7 +713,7 @@ void __init early_trap_init(void)
        /* int3 can be called from all */
        set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
 #ifdef CONFIG_X86_32
-       set_intr_gate(X86_TRAP_PF, &page_fault);
+       set_intr_gate(X86_TRAP_PF, page_fault);
 #endif
        load_idt(&idt_descr);
 }
@@ -721,7 +721,7 @@ void __init early_trap_init(void)
 void __init early_trap_pf_init(void)
 {
 #ifdef CONFIG_X86_64
-       set_intr_gate(X86_TRAP_PF, &page_fault);
+       set_intr_gate(X86_TRAP_PF, page_fault);
 #endif
 }
 
@@ -737,30 +737,30 @@ void __init trap_init(void)
        early_iounmap(p, 4);
 #endif
 
-       set_intr_gate(X86_TRAP_DE, &divide_error);
+       set_intr_gate(X86_TRAP_DE, divide_error);
        set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK);
        /* int4 can be called from all */
        set_system_intr_gate(X86_TRAP_OF, &overflow);
-       set_intr_gate(X86_TRAP_BR, &bounds);
-       set_intr_gate(X86_TRAP_UD, &invalid_op);
-       set_intr_gate(X86_TRAP_NM, &device_not_available);
+       set_intr_gate(X86_TRAP_BR, bounds);
+       set_intr_gate(X86_TRAP_UD, invalid_op);
+       set_intr_gate(X86_TRAP_NM, device_not_available);
 #ifdef CONFIG_X86_32
        set_task_gate(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS);
 #else
        set_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK);
 #endif
-       set_intr_gate(X86_TRAP_OLD_MF, &coprocessor_segment_overrun);
-       set_intr_gate(X86_TRAP_TS, &invalid_TSS);
-       set_intr_gate(X86_TRAP_NP, &segment_not_present);
+       set_intr_gate(X86_TRAP_OLD_MF, coprocessor_segment_overrun);
+       set_intr_gate(X86_TRAP_TS, invalid_TSS);
+       set_intr_gate(X86_TRAP_NP, segment_not_present);
        set_intr_gate_ist(X86_TRAP_SS, &stack_segment, STACKFAULT_STACK);
-       set_intr_gate(X86_TRAP_GP, &general_protection);
-       set_intr_gate(X86_TRAP_SPURIOUS, &spurious_interrupt_bug);
-       set_intr_gate(X86_TRAP_MF, &coprocessor_error);
-       set_intr_gate(X86_TRAP_AC, &alignment_check);
+       set_intr_gate(X86_TRAP_GP, general_protection);
+       set_intr_gate(X86_TRAP_SPURIOUS, spurious_interrupt_bug);
+       set_intr_gate(X86_TRAP_MF, coprocessor_error);
+       set_intr_gate(X86_TRAP_AC, alignment_check);
 #ifdef CONFIG_X86_MCE
        set_intr_gate_ist(X86_TRAP_MC, &machine_check, MCE_STACK);
 #endif
-       set_intr_gate(X86_TRAP_XF, &simd_coprocessor_error);
+       set_intr_gate(X86_TRAP_XF, simd_coprocessor_error);
 
        /* Reserve all the builtin and the syscall vector: */
        for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 23d8e5f..6a19ad9 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -6,6 +6,8 @@ nostackp := $(call cc-option, -fno-stack-protector)
 CFLAGS_physaddr.o              := $(nostackp)
 CFLAGS_setup_nx.o              := $(nostackp)
 
+CFLAGS_fault.o := -I$(src)/../include/asm/trace
+
 obj-$(CONFIG_X86_PAT)          += pat_rbtree.o
 obj-$(CONFIG_SMP)              += tlb.o
 
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 654be4a..f515154 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -20,6 +20,9 @@
 #include <asm/kmemcheck.h>             /* kmemcheck_*(), ...           */
 #include <asm/fixmap.h>                        /* VSYSCALL_START               */
 
+#define CREATE_TRACE_POINTS
+#include <asm/trace/exceptions.h>
+
 /*
  * Page fault error code bits:
  *
@@ -1230,3 +1233,22 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
        __do_page_fault(regs, error_code);
        exception_exit(prev_state);
 }
+
+static void trace_page_fault_entries(struct pt_regs *regs,
+                                    unsigned long error_code)
+{
+       if (user_mode(regs))
+               trace_user_page_fault(read_cr2(), regs, error_code);
+       else
+               trace_kernel_page_fault(read_cr2(), regs, error_code);
+}
+
+dotraplinkage void __kprobes
+trace_do_page_fault(struct pt_regs *regs, unsigned long error_code)
+{
+       enum ctx_state prev_state;
+       prev_state = exception_enter();
+       trace_page_fault_entries(regs, error_code);
+       __do_page_fault(regs, error_code);
+       exception_exit(prev_state);
+}
-- 
1.8.2.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to