With frame pointers, when a task is interrupted, its stack is no longer
completely reliable because the function could have been interrupted
before it had a chance to save the previous frame pointer on the stack.
So the caller of the interrupted function could get skipped by a stack
trace.

This is problematic for live patching, which needs to know whether a
stack trace of a sleeping task can be relied upon.  There's currently no
way to detect if a sleeping task was interrupted by a page fault
exception or preemption before it went to sleep.

Another issue is that when dumping the stack of an interrupted task, the
unwinder has no way of knowing where the saved pt_regs registers are, so
it can't print them.

This solves those issues by encoding the pt_regs pointer in the frame
pointer on entry from an interrupt or an exception.

This patch also updates the unwinder to be able to decode it, because
otherwise the unwinder would be broken by this change.

Note that this causes a change in the behavior of the unwinder: each
instance of a pt_regs on the stack is now considered a "frame".  So
callers of unwind_get_return_address() will now get an occasional
'regs->ip' address that would have previously been skipped over.

Suggested-by: Andy Lutomirski <[email protected]>
Signed-off-by: Josh Poimboeuf <[email protected]>
---
 arch/x86/entry/calling.h       | 21 +++++++++++
 arch/x86/entry/entry_32.S      | 40 ++++++++++++++++++---
 arch/x86/entry/entry_64.S      | 10 ++++--
 arch/x86/include/asm/unwind.h  | 18 ++++++++--
 arch/x86/kernel/unwind_frame.c | 82 +++++++++++++++++++++++++++++++++++++-----
 5 files changed, 153 insertions(+), 18 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 9a9e588..ab799a3 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -201,6 +201,27 @@ For 32-bit we have the following conventions - kernel is 
built with
        .byte 0xf1
        .endm
 
+       /*
+        * This is a sneaky trick to help the unwinder find pt_regs on the
+        * stack.  The frame pointer is replaced with an encoded pointer to
+        * pt_regs.  The encoding is just a clearing of the highest-order bit,
+        * which makes it an invalid address and is also a signal to the
+        * unwinder that it's a pt_regs pointer in disguise.
+        *
+        * NOTE: This macro must be used *after* SAVE_EXTRA_REGS because it
+        * corrupts the original rbp.
+        */
+.macro ENCODE_FRAME_POINTER ptregs_offset=0
+#ifdef CONFIG_FRAME_POINTER
+       .if \ptregs_offset
+               leaq \ptregs_offset(%rsp), %rbp
+       .else
+               mov %rsp, %rbp
+       .endif
+       btr $63, %rbp
+#endif
+.endm
+
 #endif /* CONFIG_X86_64 */
 
 /*
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 4396278..4006fa3 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -174,6 +174,23 @@
        SET_KERNEL_GS %edx
 .endm
 
+/*
+ * This is a sneaky trick to help the unwinder find pt_regs on the
+ * stack.  The frame pointer is replaced with an encoded pointer to
+ * pt_regs.  The encoding is just a clearing of the highest-order bit,
+ * which makes it an invalid address and is also a signal to the
+ * unwinder that it's a pt_regs pointer in disguise.
+ *
+ * NOTE: This macro must be used *after* SAVE_ALL because it corrupts the
+ * original rbp.
+ */
+.macro ENCODE_FRAME_POINTER
+#ifdef CONFIG_FRAME_POINTER
+       mov %esp, %ebp
+       btr $31, %ebp
+#endif
+.endm
+
 .macro RESTORE_INT_REGS
        popl    %ebx
        popl    %ecx
@@ -205,10 +222,16 @@
 .endm
 
 ENTRY(ret_from_fork)
+       call    1f
+1:     push    $0
+       mov     %esp, %ebp
+
        pushl   %eax
        call    schedule_tail
        popl    %eax
 
+       addl    $8, %esp
+
        /* When we fork, we trace the syscall return in the child, too. */
        movl    %esp, %eax
        call    syscall_return_slowpath
@@ -588,6 +611,7 @@ common_interrupt:
        ASM_CLAC
        addl    $-0x80, (%esp)                  /* Adjust vector into the 
[-256, -1] range */
        SAVE_ALL
+       ENCODE_FRAME_POINTER
        TRACE_IRQS_OFF
        movl    %esp, %eax
        call    do_IRQ
@@ -599,6 +623,7 @@ ENTRY(name)                         \
        ASM_CLAC;                       \
        pushl   $~(nr);                 \
        SAVE_ALL;                       \
+       ENCODE_FRAME_POINTER;           \
        TRACE_IRQS_OFF                  \
        movl    %esp, %eax;             \
        call    fn;                     \
@@ -733,6 +758,7 @@ END(spurious_interrupt_bug)
 ENTRY(xen_hypervisor_callback)
        pushl   $-1                             /* orig_ax = -1 => not a system 
call */
        SAVE_ALL
+       ENCODE_FRAME_POINTER
        TRACE_IRQS_OFF
 
        /*
@@ -787,6 +813,7 @@ ENTRY(xen_failsafe_callback)
        jmp     iret_exc
 5:     pushl   $-1                             /* orig_ax = -1 => not a system 
call */
        SAVE_ALL
+       ENCODE_FRAME_POINTER
        jmp     ret_from_exception
 
 .section .fixup, "ax"
@@ -1013,6 +1040,7 @@ common_exception:
        pushl   %edx
        pushl   %ecx
        pushl   %ebx
+       ENCODE_FRAME_POINTER
        cld
        movl    $(__KERNEL_PERCPU), %ecx
        movl    %ecx, %fs
@@ -1045,6 +1073,7 @@ ENTRY(debug)
        ASM_CLAC
        pushl   $-1                             # mark this as an int
        SAVE_ALL
+       ENCODE_FRAME_POINTER
        xorl    %edx, %edx                      # error code 0
        movl    %esp, %eax                      # pt_regs pointer
 
@@ -1060,11 +1089,11 @@ ENTRY(debug)
 
 .Ldebug_from_sysenter_stack:
        /* We're on the SYSENTER stack.  Switch off. */
-       movl    %esp, %ebp
+       movl    %esp, %ebx
        movl    PER_CPU_VAR(cpu_current_top_of_stack), %esp
        TRACE_IRQS_OFF
        call    do_debug
-       movl    %ebp, %esp
+       movl    %ebx, %esp
        jmp     ret_from_exception
 END(debug)
 
@@ -1087,6 +1116,7 @@ ENTRY(nmi)
 
        pushl   %eax                            # pt_regs->orig_ax
        SAVE_ALL
+       ENCODE_FRAME_POINTER
        xorl    %edx, %edx                      # zero error code
        movl    %esp, %eax                      # pt_regs pointer
 
@@ -1105,10 +1135,10 @@ ENTRY(nmi)
         * We're on the SYSENTER stack.  Switch off.  No one (not even debug)
         * is using the thread stack right now, so it's safe for us to use it.
         */
-       movl    %esp, %ebp
+       movl    %esp, %ebx
        movl    PER_CPU_VAR(cpu_current_top_of_stack), %esp
        call    do_nmi
-       movl    %ebp, %esp
+       movl    %ebx, %esp
        jmp     .Lrestore_all_notrace
 
 #ifdef CONFIG_X86_ESPFIX32
@@ -1125,6 +1155,7 @@ ENTRY(nmi)
        .endr
        pushl   %eax
        SAVE_ALL
+       ENCODE_FRAME_POINTER
        FIXUP_ESPFIX_STACK                      # %eax == %esp
        xorl    %edx, %edx                      # zero error code
        call    do_nmi
@@ -1138,6 +1169,7 @@ ENTRY(int3)
        ASM_CLAC
        pushl   $-1                             # mark this as an int
        SAVE_ALL
+       ENCODE_FRAME_POINTER
        TRACE_IRQS_OFF
        xorl    %edx, %edx                      # zero error code
        movl    %esp, %eax                      # pt_regs pointer
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index f6b40e5..6200318 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -434,6 +434,7 @@ END(irq_entries_start)
        ALLOC_PT_GPREGS_ON_STACK
        SAVE_C_REGS
        SAVE_EXTRA_REGS
+       ENCODE_FRAME_POINTER
 
        testb   $3, CS(%rsp)
        jz      1f
@@ -907,6 +908,7 @@ ENTRY(xen_failsafe_callback)
        ALLOC_PT_GPREGS_ON_STACK
        SAVE_C_REGS
        SAVE_EXTRA_REGS
+       ENCODE_FRAME_POINTER
        jmp     error_exit
 END(xen_failsafe_callback)
 
@@ -950,6 +952,7 @@ ENTRY(paranoid_entry)
        cld
        SAVE_C_REGS 8
        SAVE_EXTRA_REGS 8
+       ENCODE_FRAME_POINTER 8
        movl    $1, %ebx
        movl    $MSR_GS_BASE, %ecx
        rdmsr
@@ -997,6 +1000,7 @@ ENTRY(error_entry)
        cld
        SAVE_C_REGS 8
        SAVE_EXTRA_REGS 8
+       ENCODE_FRAME_POINTER 8
        xorl    %ebx, %ebx
        testb   $3, CS+8(%rsp)
        jz      .Lerror_kernelspace
@@ -1179,6 +1183,7 @@ ENTRY(nmi)
        pushq   %r13            /* pt_regs->r13 */
        pushq   %r14            /* pt_regs->r14 */
        pushq   %r15            /* pt_regs->r15 */
+       ENCODE_FRAME_POINTER
 
        /*
         * At this point we no longer need to worry about stack damage
@@ -1192,11 +1197,10 @@ ENTRY(nmi)
 
        /*
         * Return back to user mode.  We must *not* do the normal exit
-        * work, because we don't want to enable interrupts.  Fortunately,
-        * do_nmi doesn't modify pt_regs.
+        * work, because we don't want to enable interrupts.
         */
        SWAPGS
-       jmp     restore_c_regs_and_iret
+       jmp     restore_regs_and_iret
 
 .Lnmi_from_kernel:
        /*
diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
index 6dcb44b..295dbd1 100644
--- a/arch/x86/include/asm/unwind.h
+++ b/arch/x86/include/asm/unwind.h
@@ -13,6 +13,7 @@ struct unwind_state {
        int graph_idx;
 #ifdef CONFIG_FRAME_POINTER
        unsigned long *bp;
+       struct pt_regs *regs;
 #else
        unsigned long *sp;
 #endif
@@ -32,7 +33,7 @@ unsigned long *unwind_get_return_address_ptr(struct 
unwind_state *state)
        if (state->stack_info.type == STACK_TYPE_UNKNOWN)
                return NULL;
 
-       return state->bp + 1;
+       return state->regs ? &state->regs->ip : state->bp + 1;
 }
 
 static inline unsigned long *unwind_get_stack_ptr(struct unwind_state *state)
@@ -40,7 +41,15 @@ static inline unsigned long *unwind_get_stack_ptr(struct 
unwind_state *state)
        if (state->stack_info.type == STACK_TYPE_UNKNOWN)
                return NULL;
 
-       return state->bp;
+       return state->regs ? (unsigned long *)state->regs : state->bp;
+}
+
+static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
+{
+       if (state->stack_info.type == STACK_TYPE_UNKNOWN)
+               return NULL;
+
+       return state->regs;
 }
 
 unsigned long unwind_get_return_address(struct unwind_state *state);
@@ -61,6 +70,11 @@ static inline unsigned long *unwind_get_stack_ptr(struct 
unwind_state *state)
        return state->sp;
 }
 
+static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
+{
+       return NULL;
+}
+
 static inline
 unsigned long unwind_get_return_address(struct unwind_state *state)
 {
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index 00ad526..e02acec 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -14,6 +14,9 @@ unsigned long unwind_get_return_address(struct unwind_state 
*state)
        if (state->stack_info.type == STACK_TYPE_UNKNOWN)
                return 0;
 
+       if (state->regs && user_mode(state->regs))
+               return 0;
+
        addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, *addr_p,
                                     addr_p);
 
@@ -21,6 +24,24 @@ unsigned long unwind_get_return_address(struct unwind_state 
*state)
 }
 EXPORT_SYMBOL_GPL(unwind_get_return_address);
 
+/*
+ * This determines if the frame pointer actually contains an encoded pointer to
+ * pt_regs on the stack.  See ENCODE_FRAME_POINTER.
+ */
+static struct pt_regs *decode_frame_pointer(unsigned long *bp)
+{
+       unsigned long regs = (unsigned long)bp;
+
+       /* if the MSB is set, it's not an encoded pointer */
+       if (regs & (1UL << (BITS_PER_LONG - 1)))
+               return NULL;
+
+       /* decode it by setting the MSB */
+       regs |= 1UL << (BITS_PER_LONG - 1);
+
+       return (struct pt_regs *)regs;
+}
+
 static bool update_stack_state(struct unwind_state *state, void *addr,
                               size_t len)
 {
@@ -45,26 +66,59 @@ unknown:
 
 bool unwind_next_frame(struct unwind_state *state)
 {
-       unsigned long *next_bp;
+       struct pt_regs *regs;
+       unsigned long *next_bp, *next_sp;
+       size_t next_len;
 
        if (unwind_done(state))
                return false;
 
-       next_bp = (unsigned long *)*state->bp;
+       /* have we reached the end? */
+       if (state->regs && user_mode(state->regs))
+               goto the_end;
+
+       /* get the next frame pointer */
+       if (state->regs)
+               next_bp = (unsigned long *)state->regs->bp;
+       else
+               next_bp = (unsigned long *)*state->bp;
+
+       /* is the next frame pointer an encoded pointer to pt_regs? */
+       regs = decode_frame_pointer(next_bp);
+       if (regs) {
+               next_sp = (unsigned long *)regs;
+               next_len = sizeof(*regs);
+       } else {
+               next_sp = next_bp;
+               next_len = FRAME_HEADER_SIZE;
+       }
 
        /* make sure the next frame's data is accessible */
-       if (!update_stack_state(state, next_bp, FRAME_HEADER_SIZE))
+       if (!update_stack_state(state, next_sp, next_len))
                return false;
-
        /* move to the next frame */
-       state->bp = next_bp;
+       if (regs) {
+               state->regs = regs;
+               state->bp = NULL;
+       } else {
+               state->bp = next_bp;
+               state->regs = NULL;
+       }
+
        return true;
+
+the_end:
+       state->stack_info.type = STACK_TYPE_UNKNOWN;
+       return false;
 }
 EXPORT_SYMBOL_GPL(unwind_next_frame);
 
 void __unwind_start(struct unwind_state *state, struct task_struct *task,
                    struct pt_regs *regs, unsigned long *first_sp)
 {
+       unsigned long *bp, *sp;
+       size_t len;
+
        memset(state, 0, sizeof(*state));
        state->task = task;
 
@@ -73,12 +127,22 @@ void __unwind_start(struct unwind_state *state, struct 
task_struct *task,
                return;
 
        /* set up the first stack frame */
-       state->bp = get_frame_pointer(task, regs);
+       bp = get_frame_pointer(task, regs);
+       regs = decode_frame_pointer(bp);
+       if (regs) {
+               state->regs = regs;
+               sp = (unsigned long *)regs;
+               len = sizeof(*regs);
+       }
+       else {
+               state->bp = bp;
+               sp = bp;
+               len = FRAME_HEADER_SIZE;
+       }
 
        /* initialize stack info and make sure the frame data is accessible */
-       get_stack_info(state->bp, state->task, &state->stack_info,
-                      &state->stack_mask);
-       update_stack_state(state, state->bp, FRAME_HEADER_SIZE);
+       get_stack_info(sp, state->task, &state->stack_info, &state->stack_mask);
+       update_stack_state(state, sp, len);
 
        /*
         * The caller can optionally provide a stack pointer directly
-- 
2.7.4

Reply via email to