Hi Josh and Daniel,

On Wed, 10 Mar 2021 09:08:45 -0600
Josh Poimboeuf <jpoim...@redhat.com> wrote:

> On Wed, Mar 10, 2021 at 06:57:34PM +0900, Masami Hiramatsu wrote:
> > > If I understand correctly, for #1 you need an unwind hint which treats
> > > the instruction *after* the "pushq %rsp" as the beginning of the
> > > function.
> > 
> > Thanks for the patch. In that case, should I still change the stack 
> > allocation?
> > Or can I continue to use a series of "push/pop" ?
> 
> You can continue to use push/pop.  Objtool is only getting confused by
> the unbalanced stack of the function (more pushes than pops).  The
> unwind hint should fix that.

With you patch, I made a fix for ORC unwinder. I confirmed it works with
2 multiple kretprobes on the call path like this ;

cd /sys/kernel/debug/tracing/
echo r vfs_read >> kprobe_events
echo r full_proxy_read >> kprobe_events
echo traceoff:1 > events/kprobes/r_vfs_read_0/trigger
echo stacktrace:1 > events/kprobes/r_full_proxy_read_0/trigger
echo 1 > events/kprobes/enable
echo 1 > options/sym-offset
cat /sys/kernel/debug/kprobes/list
echo 0 > events/kprobes/enable
cat trace

# tracer: nop
#
# entries-in-buffer/entries-written: 3/3   #P:8
#
#                                _-----=> irqs-off
#                               / _----=> need-resched
#                              | / _---=> hardirq/softirq
#                              || / _--=> preempt-depth
#                              ||| /     delay
#           TASK-PID     CPU#  ||||   TIMESTAMP  FUNCTION
#              | |         |   ||||      |         |
           <...>-136     [004] ...1   648.481281: r_full_proxy_read_0: 
(vfs_read+0xab/0x1a0 <- full_proxy_read)
           <...>-136     [004] ...1   648.481310: <stack trace>
 => kretprobe_trace_func+0x209/0x2f0
 => kretprobe_dispatcher+0x4a/0x70
 => __kretprobe_trampoline_handler+0xcd/0x170
 => trampoline_handler+0x3d/0x50
 => kretprobe_trampoline+0x25/0x50
 => vfs_read+0xab/0x1a0
 => ksys_read+0x5f/0xe0
 => do_syscall_64+0x33/0x40
 => entry_SYSCALL_64_after_hwframe+0x44/0xae
 => 0
 => 0
 => 0
 => 0
 => 0
 => 0
 => 0

I didn't tested it with bpftrace, but this also handles 
regs->ip == kretprobe_trampoline case. So it should work.

commit aa452d999b524b1851f69cc947be3e1a2f3ca1ec
Author: Masami Hiramatsu <mhira...@kernel.org>
Date:   Sat Mar 6 08:34:51 2021 +0900

    x86/unwind/orc: Fixup kretprobe trampoline entry
    
    Since the kretprobe replaces the function return address with
    the kretprobe_trampoline on the stack, the ORC unwinder can not
    continue the stack unwinding at that point.
    
    To fix this issue, correct state->ip as like as function-graph
    tracer in the unwind_next_frame().
    
    Signed-off-by: Masami Hiramatsu <mhira...@kernel.org>

diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
index 70fc159ebe69..ab5e45b848d5 100644
--- a/arch/x86/include/asm/unwind.h
+++ b/arch/x86/include/asm/unwind.h
@@ -4,6 +4,7 @@
 
 #include <linux/sched.h>
 #include <linux/ftrace.h>
+#include <linux/llist.h>
 #include <asm/ptrace.h>
 #include <asm/stacktrace.h>
 
@@ -20,6 +21,9 @@ struct unwind_state {
        bool signal, full_regs;
        unsigned long sp, bp, ip;
        struct pt_regs *regs, *prev_regs;
+#if defined(CONFIG_KRETPROBES)
+       struct llist_node *kr_iter;
+#endif
 #elif defined(CONFIG_UNWINDER_FRAME_POINTER)
        bool got_irq;
        unsigned long *bp, *orig_sp, ip;
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index 2a1d47f47eee..94869516cfc0 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -2,6 +2,7 @@
 #include <linux/objtool.h>
 #include <linux/module.h>
 #include <linux/sort.h>
+#include <linux/kprobes.h>
 #include <asm/ptrace.h>
 #include <asm/stacktrace.h>
 #include <asm/unwind.h>
@@ -414,6 +415,30 @@ static bool get_reg(struct unwind_state *state, unsigned 
int reg_off,
        return false;
 }
 
+#ifdef CONFIG_KRETPROBES
+static unsigned long orc_kretprobe_correct_ip(struct unwind_state *state)
+{
+       return kretprobe_find_ret_addr(
+                       (unsigned long)kretprobe_trampoline_addr(),
+                       state->task, &state->kr_iter);
+}
+
+static bool is_kretprobe_trampoline_address(unsigned long ip)
+{
+       return ip == (unsigned long)kretprobe_trampoline_addr();
+}
+#else
+static unsigned long orc_kretprobe_correct_ip(struct unwind_state *state)
+{
+       return state->ip;
+}
+
+static bool is_kretprobe_trampoline_address(unsigned long ip)
+{
+       return false;
+}
+#endif
+
 bool unwind_next_frame(struct unwind_state *state)
 {
        unsigned long ip_p, sp, tmp, orig_ip = state->ip, prev_sp = state->sp;
@@ -536,6 +561,18 @@ bool unwind_next_frame(struct unwind_state *state)
 
                state->ip = ftrace_graph_ret_addr(state->task, 
&state->graph_idx,
                                                  state->ip, (void *)ip_p);
+               /*
+                * There are special cases when the stack unwinder is called
+                * from the kretprobe handler or the interrupt handler which
+                * occurs in the kretprobe trampoline code. In those cases,
+                * %sp is shown on the stack instead of the return address.
+                * Or, when the unwinder find the return address is replaced
+                * by kretprobe_trampoline.
+                * In those cases, correct address can be found in kretprobe.
+                */
+               if (state->ip == sp ||
+                   is_kretprobe_trampoline_address(state->ip))
+                       state->ip = orc_kretprobe_correct_ip(state);
 
                state->sp = sp;
                state->regs = NULL;
@@ -649,6 +686,12 @@ void __unwind_start(struct unwind_state *state, struct 
task_struct *task,
                state->full_regs = true;
                state->signal = true;
 
+               /*
+                * When the unwinder called with regs from kretprobe handler,
+                * the regs->ip starts from kretprobe_trampoline address.
+                */
+               if (is_kretprobe_trampoline_address(state->ip))
+                       state->ip = orc_kretprobe_correct_ip(state);
        } else if (task == current) {
                asm volatile("lea (%%rip), %0\n\t"
                             "mov %%rsp, %1\n\t"
-- 
Masami Hiramatsu <mhira...@kernel.org>

Reply via email to