[PATCH v4] trace/kprobe: Display the actual notrace function when rejecting a probe
Trying to probe update_sd_lb_stats() using perf results in the below message in the kernel log: trace_kprobe: Could not probe notrace function _text This is because 'perf probe' specifies the kprobe location as an offset from '_text': $ sudo perf probe -D update_sd_lb_stats p:probe/update_sd_lb_stats _text+1830728 However, the error message is misleading and doesn't help convey the actual notrace function that is being probed. Fix this by looking up the actual function name that is being probed. With this fix, we now get the below message in the kernel log: trace_kprobe: Could not probe notrace function update_sd_lb_stats.constprop.0 Signed-off-by: Naveen N Rao --- v4: Use printk format specifier %ps with probe address to lookup the symbol, as suggested by Masami. kernel/trace/trace_kprobe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 3d7a180a8427..0017404d6e8d 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -487,8 +487,8 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) return -EINVAL; if (within_notrace_func(tk)) { - pr_warn("Could not probe notrace function %s\n", - trace_kprobe_symbol(tk)); + pr_warn("Could not probe notrace function %ps\n", + (void *)trace_kprobe_address(tk)); return -EINVAL; } base-commit: 4758560fa268cecfa1144f015aa9f2525d164b7e -- 2.43.0
Re: [PATCH v3] trace/kprobe: Display the actual notrace function when rejecting a probe
On Thu, Dec 14, 2023 at 08:02:10AM +0900, Masami Hiramatsu wrote: > On Wed, 13 Dec 2023 20:09:14 +0530 > Naveen N Rao wrote: > > > Trying to probe update_sd_lb_stats() using perf results in the below > > message in the kernel log: > > trace_kprobe: Could not probe notrace function _text > > > > This is because 'perf probe' specifies the kprobe location as an offset > > from '_text': > > $ sudo perf probe -D update_sd_lb_stats > > p:probe/update_sd_lb_stats _text+1830728 > > > > However, the error message is misleading and doesn't help convey the > > actual notrace function that is being probed. Fix this by looking up the > > actual function name that is being probed. With this fix, we now get the > > below message in the kernel log: > > trace_kprobe: Could not probe notrace function > > update_sd_lb_stats.constprop.0 > > > > Signed-off-by: Naveen N Rao > > --- > > v3: Remove tk parameter from within_notrace_func() as suggested by > > Masami > > > > kernel/trace/trace_kprobe.c | 11 ++- > > 1 file changed, 6 insertions(+), 5 deletions(-) > > > > diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c > > index 3d7a180a8427..dc36c6ed6131 100644 > > --- a/kernel/trace/trace_kprobe.c > > +++ b/kernel/trace/trace_kprobe.c > > @@ -449,9 +449,8 @@ static bool __within_notrace_func(unsigned long addr) > > return !ftrace_location_range(addr, addr + size - 1); > > } > > > > -static bool within_notrace_func(struct trace_kprobe *tk) > > +static bool within_notrace_func(unsigned long addr) > > { > > - unsigned long addr = trace_kprobe_address(tk); > > char symname[KSYM_NAME_LEN], *p; > > > > if (!__within_notrace_func(addr)) > > @@ -471,12 +470,14 @@ static bool within_notrace_func(struct trace_kprobe > > *tk) > > return true; > > } > > #else > > -#define within_notrace_func(tk)(false) > > +#define within_notrace_func(addr) (false) > > #endif > > > > /* Internal register function - just handle k*probes and flags */ > > static int __register_trace_kprobe(struct trace_kprobe *tk) > > { > > + unsigned long addr = trace_kprobe_address(tk); > > + char symname[KSYM_NAME_LEN]; > > int i, ret; > > > > ret = security_locked_down(LOCKDOWN_KPROBES); > > @@ -486,9 +487,9 @@ static int __register_trace_kprobe(struct trace_kprobe > > *tk) > > if (trace_kprobe_is_registered(tk)) > > return -EINVAL; > > > > - if (within_notrace_func(tk)) { > > + if (within_notrace_func(addr)) { > > pr_warn("Could not probe notrace function %s\n", > > - trace_kprobe_symbol(tk)); > > + lookup_symbol_name(addr, symname) ? > > trace_kprobe_symbol(tk) : symname); > > Can we just use %ps and (void *)trace_kprobe_address(tk) here? > > That will be simpler. Indeed - that is much simpler. v4 on its way... Thanks! - Naveen
[PATCH v3] trace/kprobe: Display the actual notrace function when rejecting a probe
Trying to probe update_sd_lb_stats() using perf results in the below message in the kernel log: trace_kprobe: Could not probe notrace function _text This is because 'perf probe' specifies the kprobe location as an offset from '_text': $ sudo perf probe -D update_sd_lb_stats p:probe/update_sd_lb_stats _text+1830728 However, the error message is misleading and doesn't help convey the actual notrace function that is being probed. Fix this by looking up the actual function name that is being probed. With this fix, we now get the below message in the kernel log: trace_kprobe: Could not probe notrace function update_sd_lb_stats.constprop.0 Signed-off-by: Naveen N Rao --- v3: Remove tk parameter from within_notrace_func() as suggested by Masami kernel/trace/trace_kprobe.c | 11 ++- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 3d7a180a8427..dc36c6ed6131 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -449,9 +449,8 @@ static bool __within_notrace_func(unsigned long addr) return !ftrace_location_range(addr, addr + size - 1); } -static bool within_notrace_func(struct trace_kprobe *tk) +static bool within_notrace_func(unsigned long addr) { - unsigned long addr = trace_kprobe_address(tk); char symname[KSYM_NAME_LEN], *p; if (!__within_notrace_func(addr)) @@ -471,12 +470,14 @@ static bool within_notrace_func(struct trace_kprobe *tk) return true; } #else -#define within_notrace_func(tk)(false) +#define within_notrace_func(addr) (false) #endif /* Internal register function - just handle k*probes and flags */ static int __register_trace_kprobe(struct trace_kprobe *tk) { + unsigned long addr = trace_kprobe_address(tk); + char symname[KSYM_NAME_LEN]; int i, ret; ret = security_locked_down(LOCKDOWN_KPROBES); @@ -486,9 +487,9 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) if (trace_kprobe_is_registered(tk)) return -EINVAL; - if (within_notrace_func(tk)) { + if (within_notrace_func(addr)) { pr_warn("Could not probe notrace function %s\n", - trace_kprobe_symbol(tk)); + lookup_symbol_name(addr, symname) ? trace_kprobe_symbol(tk) : symname); return -EINVAL; } base-commit: 4758560fa268cecfa1144f015aa9f2525d164b7e -- 2.43.0
Re: [PATCH v1 2/2] powerpc: Enable OPTPROBES on PPC32
Christophe Leroy wrote: For that, create a 32 bits version of patch_imm64_load_insns() and create a patch_imm_load_insns() which calls patch_imm32_load_insns() on PPC32 and patch_imm64_load_insns() on PPC64. Adapt optprobes_head.S for PPC32. Use PPC_LL/PPC_STL macros instead of raw ld/std, opt out things linked to paca and use stmw/lmw to save/restore registers. Signed-off-by: Christophe Leroy --- arch/powerpc/Kconfig | 2 +- arch/powerpc/kernel/optprobes.c | 24 +-- arch/powerpc/kernel/optprobes_head.S | 46 +++- 3 files changed, 53 insertions(+), 19 deletions(-) Thanks for adding support for ppc32. It is good to see that it works without too many changes. diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index c1344c05226c..49b538e54efb 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -227,7 +227,7 @@ config PPC select HAVE_MOD_ARCH_SPECIFIC select HAVE_NMI if PERF_EVENTS || (PPC64 && PPC_BOOK3S) select HAVE_HARDLOCKUP_DETECTOR_ARCHif PPC64 && PPC_BOOK3S && SMP - select HAVE_OPTPROBES if PPC64 + select HAVE_OPTPROBES select HAVE_PERF_EVENTS select HAVE_PERF_EVENTS_NMI if PPC64 select HAVE_HARDLOCKUP_DETECTOR_PERFif PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c index 58fdb9f66e0f..cdf87086fa33 100644 --- a/arch/powerpc/kernel/optprobes.c +++ b/arch/powerpc/kernel/optprobes.c @@ -141,11 +141,21 @@ void arch_remove_optimized_kprobe(struct optimized_kprobe *op) } } +static void patch_imm32_load_insns(unsigned long val, int reg, kprobe_opcode_t *addr) +{ + patch_instruction((struct ppc_inst *)addr, + ppc_inst(PPC_RAW_LIS(reg, IMM_H(val; + addr++; + + patch_instruction((struct ppc_inst *)addr, + ppc_inst(PPC_RAW_ORI(reg, reg, IMM_L(val; +} + /* * Generate instructions to load provided immediate 64-bit value * to register 'reg' and patch these instructions at 'addr'. */ -static void patch_imm64_load_insns(unsigned long val, int reg, kprobe_opcode_t *addr) +static void patch_imm64_load_insns(unsigned long long val, int reg, kprobe_opcode_t *addr) Do you really need this? { /* lis reg,(op)@highest */ patch_instruction((struct ppc_inst *)addr, @@ -177,6 +187,14 @@ static void patch_imm64_load_insns(unsigned long val, int reg, kprobe_opcode_t * ___PPC_RS(reg) | (val & 0x))); } +static void patch_imm_load_insns(unsigned long val, int reg, kprobe_opcode_t *addr) +{ + if (IS_ENABLED(CONFIG_PPC64)) + patch_imm64_load_insns(val, reg, addr); + else + patch_imm32_load_insns(val, reg, addr); +} + int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) { struct ppc_inst branch_op_callback, branch_emulate_step, temp; @@ -230,7 +248,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) * Fixup the template with instructions to: * 1. load the address of the actual probepoint */ - patch_imm64_load_insns((unsigned long)op, 3, buff + TMPL_OP_IDX); + patch_imm_load_insns((unsigned long)op, 3, buff + TMPL_OP_IDX); /* * 2. branch to optimized_callback() and emulate_step() @@ -264,7 +282,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) * 3. load instruction to be emulated into relevant register, and */ temp = ppc_inst_read((struct ppc_inst *)p->ainsn.insn); - patch_imm64_load_insns(ppc_inst_as_ulong(temp), 4, buff + TMPL_INSN_IDX); + patch_imm_load_insns(ppc_inst_as_ulong(temp), 4, buff + TMPL_INSN_IDX); /* * 4. branch back from trampoline diff --git a/arch/powerpc/kernel/optprobes_head.S b/arch/powerpc/kernel/optprobes_head.S index ff8ba4d3824d..49f31e554573 100644 --- a/arch/powerpc/kernel/optprobes_head.S +++ b/arch/powerpc/kernel/optprobes_head.S @@ -30,39 +30,47 @@ optinsn_slot: .global optprobe_template_entry optprobe_template_entry: /* Create an in-memory pt_regs */ - stdur1,-INT_FRAME_SIZE(r1) + PPC_STLUr1,-INT_FRAME_SIZE(r1) SAVE_GPR(0,r1) /* Save the previous SP into stack */ addir0,r1,INT_FRAME_SIZE - std r0,GPR1(r1) + PPC_STL r0,GPR1(r1) +#ifdef CONFIG_PPC64 SAVE_10GPRS(2,r1) SAVE_10GPRS(12,r1) SAVE_10GPRS(22,r1) +#else + stmwr2, GPR2(r1) +#endif /* Save SPRS */ mfmsr r5 - std r5,_MSR(r1) + PPC_STL r5,_MSR(r1) li r5,0x700 - std r5,_TRAP(r1) + PPC_STL r5,_TRAP(r1) li r5,0 - std r5,ORIG_GPR3(r1)
Re: [PATCH v4] powerpc/uprobes: Validation for prefixed instruction
On 2021/03/09 08:54PM, Michael Ellerman wrote: > Ravi Bangoria writes: > > As per ISA 3.1, prefixed instruction should not cross 64-byte > > boundary. So don't allow Uprobe on such prefixed instruction. > > > > There are two ways probed instruction is changed in mapped pages. > > First, when Uprobe is activated, it searches for all the relevant > > pages and replace instruction in them. In this case, if that probe > > is on the 64-byte unaligned prefixed instruction, error out > > directly. Second, when Uprobe is already active and user maps a > > relevant page via mmap(), instruction is replaced via mmap() code > > path. But because Uprobe is invalid, entire mmap() operation can > > not be stopped. In this case just print an error and continue. > > > > Signed-off-by: Ravi Bangoria > > Acked-by: Naveen N. Rao > > Do we have a Fixes: tag for this? Since this is an additional check we are adding, I don't think we should add a Fixes: tag. Nothing is broken per-se -- we're just adding more checks to catch simple mistakes. Also, like Oleg pointed out, there are still many other ways for users to shoot themselves in the foot with uprobes and prefixed instructions, if they so desire. However, if you still think we should add a Fixes: tag, we can perhaps use the below commit since I didn't see any specific commit adding support for prefixed instructions for uprobes: Fixes: 650b55b707fdfa ("powerpc: Add prefixed instructions to instruction data type") > > > --- > > v3: > > https://lore.kernel.org/r/20210304050529.59391-1-ravi.bango...@linux.ibm.com > > v3->v4: > > - CONFIG_PPC64 check was not required, remove it. > > - Use SZ_ macros instead of hardcoded numbers. > > > > arch/powerpc/kernel/uprobes.c | 7 +++ > > 1 file changed, 7 insertions(+) > > > > diff --git a/arch/powerpc/kernel/uprobes.c b/arch/powerpc/kernel/uprobes.c > > index e8a63713e655..4cbfff6e94a3 100644 > > --- a/arch/powerpc/kernel/uprobes.c > > +++ b/arch/powerpc/kernel/uprobes.c > > @@ -41,6 +41,13 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, > > if (addr & 0x03) > > return -EINVAL; > > > > + if (cpu_has_feature(CPU_FTR_ARCH_31) && > > + ppc_inst_prefixed(auprobe->insn) && > > + (addr & (SZ_64 - 4)) == SZ_64 - 4) { > > + pr_info_ratelimited("Cannot register a uprobe on 64 byte > > unaligned prefixed instruction\n"); > > + return -EINVAL; > > I realise we already did the 0x03 check above, but I still think this > would be clearer simply as: > > (addr & 0x3f == 60) Indeed, I like the use of `60' there -- hex is overrated ;) - Naveen
Re: [PATCH v3] powerpc/uprobes: Validation for prefixed instruction
On 2021/03/04 10:35AM, Ravi Bangoria wrote: > As per ISA 3.1, prefixed instruction should not cross 64-byte > boundary. So don't allow Uprobe on such prefixed instruction. > > There are two ways probed instruction is changed in mapped pages. > First, when Uprobe is activated, it searches for all the relevant > pages and replace instruction in them. In this case, if that probe > is on the 64-byte unaligned prefixed instruction, error out > directly. Second, when Uprobe is already active and user maps a > relevant page via mmap(), instruction is replaced via mmap() code > path. But because Uprobe is invalid, entire mmap() operation can > not be stopped. In this case just print an error and continue. > > Signed-off-by: Ravi Bangoria > --- > v2: > https://lore.kernel.org/r/20210204104703.273429-1-ravi.bango...@linux.ibm.com > v2->v3: > - Drop restriction for Uprobe on suffix of prefixed instruction. > It needs lot of code change including generic code but what > we get in return is not worth it. > > arch/powerpc/kernel/uprobes.c | 8 > 1 file changed, 8 insertions(+) > > diff --git a/arch/powerpc/kernel/uprobes.c b/arch/powerpc/kernel/uprobes.c > index e8a63713e655..c400971ebe70 100644 > --- a/arch/powerpc/kernel/uprobes.c > +++ b/arch/powerpc/kernel/uprobes.c > @@ -41,6 +41,14 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, > if (addr & 0x03) > return -EINVAL; > > + if (!IS_ENABLED(CONFIG_PPC64) || !cpu_has_feature(CPU_FTR_ARCH_31)) > + return 0; Sorry, I missed this last time, but I think we can drop the above checks. ppc_inst_prefixed() already factors in the dependency on CONFIG_PPC64, and I don't think we need to confirm if we're running on a ISA V3.1 for the below check. With that: Acked-by: Naveen N. Rao > + > + if (ppc_inst_prefixed(auprobe->insn) && (addr & 0x3F) == 0x3C) { > + pr_info_ratelimited("Cannot register a uprobe on 64 byte > unaligned prefixed instruction\n"); > + return -EINVAL; > + } > + - Naveen
Re: [PATCH v2] powerpc/uprobes: Validation for prefixed instruction
On 2021/02/04 06:38PM, Naveen N. Rao wrote: > On 2021/02/04 04:17PM, Ravi Bangoria wrote: > > Don't allow Uprobe on 2nd word of a prefixed instruction. As per > > ISA 3.1, prefixed instruction should not cross 64-byte boundary. > > So don't allow Uprobe on such prefixed instruction as well. > > > > There are two ways probed instruction is changed in mapped pages. > > First, when Uprobe is activated, it searches for all the relevant > > pages and replace instruction in them. In this case, if we notice > > that probe is on the 2nd word of prefixed instruction, error out > > directly. Second, when Uprobe is already active and user maps a > > relevant page via mmap(), instruction is replaced via mmap() code > > path. But because Uprobe is invalid, entire mmap() operation can > > not be stopped. In this case just print an error and continue. > > > > Signed-off-by: Ravi Bangoria > > --- > > v1: > > http://lore.kernel.org/r/20210119091234.76317-1-ravi.bango...@linux.ibm.com > > v1->v2: > > - Instead of introducing new arch hook from verify_opcode(), use > > existing hook arch_uprobe_analyze_insn(). > > - Add explicit check for prefixed instruction crossing 64-byte > > boundary. If probe is on such instruction, throw an error. > > > > arch/powerpc/kernel/uprobes.c | 66 ++- > > 1 file changed, 65 insertions(+), 1 deletion(-) > > > > diff --git a/arch/powerpc/kernel/uprobes.c b/arch/powerpc/kernel/uprobes.c > > index e8a63713e655..485d19a2a31f 100644 > > --- a/arch/powerpc/kernel/uprobes.c > > +++ b/arch/powerpc/kernel/uprobes.c > > @@ -7,6 +7,7 @@ > > * Adapted from the x86 port by Ananth N Mavinakayanahalli > > > > */ > > #include > > +#include > > #include > > #include > > #include > > @@ -28,6 +29,69 @@ bool is_trap_insn(uprobe_opcode_t *insn) > > return (is_trap(*insn)); > > } > > > > +#ifdef CONFIG_PPC64 > > +static int get_instr(struct mm_struct *mm, unsigned long addr, u32 *instr) > > +{ > > + struct page *page; > > + struct vm_area_struct *vma; > > + void *kaddr; > > + unsigned int gup_flags = FOLL_FORCE | FOLL_SPLIT_PMD; > > + > > + if (get_user_pages_remote(mm, addr, 1, gup_flags, , , NULL) <= > > 0) > > + return -EINVAL; > > + > > + kaddr = kmap_atomic(page); > > + *instr = *((u32 *)(kaddr + (addr & ~PAGE_MASK))); > > + kunmap_atomic(kaddr); > > + put_page(page); > > + return 0; > > +} > > + > > +static int validate_prefixed_instr(struct mm_struct *mm, unsigned long > > addr) > > +{ > > + struct ppc_inst inst; > > + u32 prefix, suffix; > > + > > + /* > > +* No need to check if addr is pointing to beginning of the > > +* page. Even if probe is on a suffix of page-unaligned > > +* prefixed instruction, hw will raise exception and kernel > > +* will send SIGBUS. > > +*/ > > + if (!(addr & ~PAGE_MASK)) > > + return 0; > > + > > + if (get_instr(mm, addr, ) < 0) > > + return -EINVAL; > > + if (get_instr(mm, addr + 4, ) < 0) > > + return -EINVAL; > > + > > + inst = ppc_inst_prefix(prefix, suffix); > > + if (ppc_inst_prefixed(inst) && (addr & 0x3F) == 0x3C) { > > + printk_ratelimited("Cannot register a uprobe on 64 byte " > ^^ pr_info_ratelimited() > > It should be sufficient to check the primary opcode to determine if it > is a prefixed instruction. You don't have to read the suffix. I see that > we don't have a helper to do this currently, so you could do: > > if (ppc_inst_primary_opcode(ppc_inst(prefix)) == 1) Seeing the kprobes code, I realized that we have to check for another scenario (Thanks, Jordan!). If this is the suffix of a prefix instruction for which a uprobe has already been installed, then the previous word will be a 'trap' instruction. You need to check if there is a uprobe at the previous word, and if the original instruction there was a prefix instruction. - Naveen
Re: [PATCH v2] powerpc/uprobes: Validation for prefixed instruction
On 2021/02/04 04:19PM, Ravi Bangoria wrote: > > > On 2/4/21 4:17 PM, Ravi Bangoria wrote: > > Don't allow Uprobe on 2nd word of a prefixed instruction. As per > > ISA 3.1, prefixed instruction should not cross 64-byte boundary. > > So don't allow Uprobe on such prefixed instruction as well. > > > > There are two ways probed instruction is changed in mapped pages. > > First, when Uprobe is activated, it searches for all the relevant > > pages and replace instruction in them. In this case, if we notice > > that probe is on the 2nd word of prefixed instruction, error out > > directly. Second, when Uprobe is already active and user maps a > > relevant page via mmap(), instruction is replaced via mmap() code > > path. But because Uprobe is invalid, entire mmap() operation can > > not be stopped. In this case just print an error and continue. > > @mpe, > > arch_uprobe_analyze_insn() can return early if > cpu_has_feature(CPU_FTR_ARCH_31) is not set. But that will > miss out a rare scenario of user running binary with prefixed > instruction on p10 predecessors. Please let me know if I > should add cpu_has_feature(CPU_FTR_ARCH_31) or not. The check you are adding is very specific to prefixed instructions, so it makes sense to add a cpu feature check for v3.1. On older processors, those are invalid instructions like any other. The instruction emulation infrastructure will refuse to emulate it and the instruction will be single stepped. - Naveen
Re: [PATCH v2] powerpc/uprobes: Validation for prefixed instruction
On 2021/02/04 04:17PM, Ravi Bangoria wrote: > Don't allow Uprobe on 2nd word of a prefixed instruction. As per > ISA 3.1, prefixed instruction should not cross 64-byte boundary. > So don't allow Uprobe on such prefixed instruction as well. > > There are two ways probed instruction is changed in mapped pages. > First, when Uprobe is activated, it searches for all the relevant > pages and replace instruction in them. In this case, if we notice > that probe is on the 2nd word of prefixed instruction, error out > directly. Second, when Uprobe is already active and user maps a > relevant page via mmap(), instruction is replaced via mmap() code > path. But because Uprobe is invalid, entire mmap() operation can > not be stopped. In this case just print an error and continue. > > Signed-off-by: Ravi Bangoria > --- > v1: > http://lore.kernel.org/r/20210119091234.76317-1-ravi.bango...@linux.ibm.com > v1->v2: > - Instead of introducing new arch hook from verify_opcode(), use > existing hook arch_uprobe_analyze_insn(). > - Add explicit check for prefixed instruction crossing 64-byte > boundary. If probe is on such instruction, throw an error. > > arch/powerpc/kernel/uprobes.c | 66 ++- > 1 file changed, 65 insertions(+), 1 deletion(-) > > diff --git a/arch/powerpc/kernel/uprobes.c b/arch/powerpc/kernel/uprobes.c > index e8a63713e655..485d19a2a31f 100644 > --- a/arch/powerpc/kernel/uprobes.c > +++ b/arch/powerpc/kernel/uprobes.c > @@ -7,6 +7,7 @@ > * Adapted from the x86 port by Ananth N Mavinakayanahalli > > */ > #include > +#include > #include > #include > #include > @@ -28,6 +29,69 @@ bool is_trap_insn(uprobe_opcode_t *insn) > return (is_trap(*insn)); > } > > +#ifdef CONFIG_PPC64 > +static int get_instr(struct mm_struct *mm, unsigned long addr, u32 *instr) > +{ > + struct page *page; > + struct vm_area_struct *vma; > + void *kaddr; > + unsigned int gup_flags = FOLL_FORCE | FOLL_SPLIT_PMD; > + > + if (get_user_pages_remote(mm, addr, 1, gup_flags, , , NULL) <= > 0) > + return -EINVAL; > + > + kaddr = kmap_atomic(page); > + *instr = *((u32 *)(kaddr + (addr & ~PAGE_MASK))); > + kunmap_atomic(kaddr); > + put_page(page); > + return 0; > +} > + > +static int validate_prefixed_instr(struct mm_struct *mm, unsigned long addr) > +{ > + struct ppc_inst inst; > + u32 prefix, suffix; > + > + /* > + * No need to check if addr is pointing to beginning of the > + * page. Even if probe is on a suffix of page-unaligned > + * prefixed instruction, hw will raise exception and kernel > + * will send SIGBUS. > + */ > + if (!(addr & ~PAGE_MASK)) > + return 0; > + > + if (get_instr(mm, addr, ) < 0) > + return -EINVAL; > + if (get_instr(mm, addr + 4, ) < 0) > + return -EINVAL; > + > + inst = ppc_inst_prefix(prefix, suffix); > + if (ppc_inst_prefixed(inst) && (addr & 0x3F) == 0x3C) { > + printk_ratelimited("Cannot register a uprobe on 64 byte " ^^ pr_info_ratelimited() It should be sufficient to check the primary opcode to determine if it is a prefixed instruction. You don't have to read the suffix. I see that we don't have a helper to do this currently, so you could do: if (ppc_inst_primary_opcode(ppc_inst(prefix)) == 1) In the future, it might be worthwhile to add IS_PREFIX() as a macro similar to IS_MTMSRD() if there are more such uses. Along with this, if you also add the below to the start of this function, you can get rid of the #ifdef: if (!IS_ENABLED(CONFIG_PPC64)) return 0; - Naveen
Re: [PATCH] kprobes: Warn if the kprobe is reregistered
On 2021/02/03 11:59PM, Masami Hiramatsu wrote: > Warn if the kprobe is reregistered, since there must be > a software bug (actively used resource must not be re-registered) > and caller must be fixed. > > Signed-off-by: Masami Hiramatsu > --- > kernel/kprobes.c | 13 - > 1 file changed, 8 insertions(+), 5 deletions(-) Suggested-by: Naveen N. Rao Acked-by: Naveen N. Rao Thanks, Naveen
Re: [PATCH] selftests/ftrace: Disable multiple_kprobes test on powerpc
Masami Hiramatsu wrote: On Tue, 5 Jan 2021 19:01:56 +0900 Masami Hiramatsu wrote: On Tue, 5 Jan 2021 12:27:30 +0530 "Naveen N. Rao" wrote: > Not all symbols are blacklisted on powerpc. Disable multiple_kprobes > test until that is sorted, so that rest of ftrace and kprobe selftests > can be run. This looks good to me, but could you try to find the functions which should be blocked from kprobes? (Usually, the function which are involved in the sw-breakpoint handling, including locks etc.) Ah, OK. I wonder why CONFIG_KPROBE_EVENTS_ON_NOTRACE=n doesn't help, it was ignored if the arch doesn't support CONFIG_KPROBES_ON_FTRACE. Good point, though we do support CONFIG_KPROBES_ON_FTRACE on powerpc so the below patch is unlikely to help. However, since entry code is unlikely to be the source of the issue due to CONFIG_KPROBE_EVENTS_ON_NOTRACE, I will take another look to see where the problem lies. Naveen, could you try to run this test case with following patch on powerpc? diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b911e9f6d9f5..241a55313476 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -433,7 +433,7 @@ static int disable_trace_kprobe(struct trace_event_call *call, return 0; } -#if defined(CONFIG_KPROBES_ON_FTRACE) && \ +#if defined(CONFIG_FUNCTION_TRACER) && \ !defined(CONFIG_KPROBE_EVENTS_ON_NOTRACE) static bool __within_notrace_func(unsigned long addr) { This looks like a good change regardless, so if you intend to post this separately: Acked-by: Naveen N. Rao Thanks, Naveen
Re: [PATCH] selftests/ftrace: Disable multiple_kprobes test on powerpc
Masami Hiramatsu wrote: On Tue, 5 Jan 2021 12:27:30 +0530 "Naveen N. Rao" wrote: Not all symbols are blacklisted on powerpc. Disable multiple_kprobes test until that is sorted, so that rest of ftrace and kprobe selftests can be run. This looks good to me, but could you try to find the functions which should be blocked from kprobes? (Usually, the function which are involved in the sw-breakpoint handling, including locks etc.) Yes, we did add several blacklists some time back, but there has been quite a bit of churn in our entry code. I've been meaning to audit it for a while now, but this has been blocking tests. It would be nice to skip this test for now until I am able to spend some time on this. Thanks, Naveen
[PATCH] selftests/ftrace: Disable multiple_kprobes test on powerpc
Not all symbols are blacklisted on powerpc. Disable multiple_kprobes test until that is sorted, so that rest of ftrace and kprobe selftests can be run. Signed-off-by: Naveen N. Rao --- .../testing/selftests/ftrace/test.d/kprobe/multiple_kprobes.tc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/multiple_kprobes.tc b/tools/testing/selftests/ftrace/test.d/kprobe/multiple_kprobes.tc index 312d237800969e..41503c32f53eed 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/multiple_kprobes.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/multiple_kprobes.tc @@ -7,7 +7,7 @@ # Currently HAVE_KPROBES_ON_FTRACE defined on x86 and powerpc64le case `uname -m` in x86_64|i[3456]86) OFFS=5;; - ppc64le) OFFS=8;; + ppc*) exit_unsupported;; *) OFFS=0;; esac base-commit: 36bbbd0e234d817938bdc52121a0f5473b3e58f5 -- 2.25.4
Re: [PATCH 3/4] perf tools: Update powerpc's syscall.tbl
Arnaldo Carvalho de Melo wrote: Em Fri, Dec 18, 2020 at 08:08:56PM +0530, Naveen N. Rao escreveu: Hi Arnaldo, Arnaldo Carvalho de Melo wrote: > Em Fri, Dec 18, 2020 at 08:26:59AM -0300, Arnaldo Carvalho de Melo escreveu: > > Em Fri, Dec 18, 2020 at 03:59:23PM +0800, Tiezhu Yang escreveu: > > > This silences the following tools/perf/ build warning: > > > Warning: Kernel ABI header at 'tools/perf/arch/powerpc/entry/syscalls/syscall.tbl' differs from latest version at 'arch/powerpc/kernel/syscalls/syscall.tbl' > > > > Hi Ravi, Naveen, > > > > Can I get your Reviewed-by or Acked-by for this change and the > > other that adds s390's syscall.tbl to check_headers.sh so that we get > > oops s/s390/powerpc/g :-) > > > notified when the copy drifts, so that we can see if it still continues > > working and we can get new syscalls to be supported in things like 'perf > > trace'? Yes, this looks good to me: Reviewed-by: Naveen N. Rao FWIW, I had posted a similar patch back in April, but glad to have this go in ;) http://lkml.kernel.org/r/20200220063740.785913-1-naveen.n@linux.vnet.ibm.com My bad :-\ No worries, thanks for checking on this one. - Naveen
Re: [PATCH 3/4] perf tools: Update powerpc's syscall.tbl
Hi Arnaldo, Arnaldo Carvalho de Melo wrote: Em Fri, Dec 18, 2020 at 08:26:59AM -0300, Arnaldo Carvalho de Melo escreveu: Em Fri, Dec 18, 2020 at 03:59:23PM +0800, Tiezhu Yang escreveu: > This silences the following tools/perf/ build warning: > Warning: Kernel ABI header at 'tools/perf/arch/powerpc/entry/syscalls/syscall.tbl' differs from latest version at 'arch/powerpc/kernel/syscalls/syscall.tbl' Hi Ravi, Naveen, Can I get your Reviewed-by or Acked-by for this change and the other that adds s390's syscall.tbl to check_headers.sh so that we get oops s/s390/powerpc/g :-) notified when the copy drifts, so that we can see if it still continues working and we can get new syscalls to be supported in things like 'perf trace'? Yes, this looks good to me: Reviewed-by: Naveen N. Rao FWIW, I had posted a similar patch back in April, but glad to have this go in ;) http://lkml.kernel.org/r/20200220063740.785913-1-naveen.n@linux.vnet.ibm.com Thanks, Naveen
Re: [RFC PATCH 01/14] ftrace: Fix updating FTRACE_FL_TRAMP
Steven Rostedt wrote: On Thu, 26 Nov 2020 23:38:38 +0530 "Naveen N. Rao" wrote: On powerpc, kprobe-direct.tc triggered FTRACE_WARN_ON() in ftrace_get_addr_new() followed by the below message: Bad trampoline accounting at: 4222522f (wake_up_process+0xc/0x20) (f001) The set of steps leading to this involved: - modprobe ftrace-direct-too - enable_probe - modprobe ftrace-direct - rmmod ftrace-direct <-- trigger The problem turned out to be that we were not updating flags in the ftrace record properly. From the above message about the trampoline accounting being bad, it can be seen that the ftrace record still has FTRACE_FL_TRAMP set though ftrace-direct module is going away. This happens because we are checking if any ftrace_ops has the FTRACE_FL_TRAMP flag set _before_ updating the filter hash. The fix for this is to look for any _other_ ftrace_ops that also needs FTRACE_FL_TRAMP. I'm applying this now and sending this for -rc and stable. The code worked on x86 because x86 has a way to make all users use trampolines, so this was never an issue (everything has a trampoline). I modified the kernel so that x86 would not create its own trampoline (see the weak function arch_ftrace_update_trampoline(), and I was able to reproduce the bug. Good to know that you were able to reproduce this. I'm adding: Cc: sta...@vger.kernel.org Fixes: a124692b698b0 ("ftrace: Enable trampoline when rec count returns back to one") That looks good to me. Thanks for picking the two patches and for your review on the others! - Naveen
[RFC PATCH 09/14] powerpc/ftrace: Use a hash table for tracking ftrace stubs
In preparation for having to deal with large number of ftrace stubs in support of ftrace direct calls, convert existing stubs to use a hash table. The hash table is key'ed off the target address for the stubs since there could be multiple stubs for the same target to cover the full kernel text. Signed-off-by: Naveen N. Rao --- arch/powerpc/kernel/trace/ftrace.c | 75 +- 1 file changed, 33 insertions(+), 42 deletions(-) diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index 14b39f7797d455..7ddb6e4b527c39 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -13,6 +13,7 @@ #define pr_fmt(fmt) "ftrace-powerpc: " fmt +#include #include #include #include @@ -32,14 +33,12 @@ #ifdef CONFIG_DYNAMIC_FTRACE -/* - * We generally only have a single long_branch tramp and at most 2 or 3 plt - * tramps generated. But, we don't use the plt tramps currently. We also allot - * 2 tramps after .text and .init.text. So, we only end up with around 3 usable - * tramps in total. Set aside 8 just to be sure. - */ -#defineNUM_FTRACE_TRAMPS 8 -static unsigned long ftrace_tramps[NUM_FTRACE_TRAMPS]; +static DEFINE_HASHTABLE(ppc_ftrace_stubs, 8); +struct ppc_ftrace_stub_data { + unsigned long addr; + unsigned long target; + struct hlist_node hentry; +}; static struct ppc_inst ftrace_call_replace(unsigned long ip, unsigned long addr, int link) @@ -288,36 +287,31 @@ __ftrace_make_nop(struct module *mod, #endif /* PPC64 */ #endif /* CONFIG_MODULES */ -static unsigned long find_ftrace_tramp(unsigned long ip) +static unsigned long find_ftrace_tramp(unsigned long ip, unsigned long target) { - int i; + struct ppc_ftrace_stub_data *stub; struct ppc_inst instr; - /* -* We have the compiler generated long_branch tramps at the end -* and we prefer those -*/ - for (i = NUM_FTRACE_TRAMPS - 1; i >= 0; i--) - if (!ftrace_tramps[i]) - continue; - else if (create_branch(, (void *)ip, - ftrace_tramps[i], 0) == 0) - return ftrace_tramps[i]; + hash_for_each_possible(ppc_ftrace_stubs, stub, hentry, target) + if (stub->target == target && !create_branch(, (void *)ip, stub->addr, 0)) + return stub->addr; return 0; } -static int add_ftrace_tramp(unsigned long tramp) +static int add_ftrace_tramp(unsigned long tramp, unsigned long target) { - int i; + struct ppc_ftrace_stub_data *stub; - for (i = 0; i < NUM_FTRACE_TRAMPS; i++) - if (!ftrace_tramps[i]) { - ftrace_tramps[i] = tramp; - return 0; - } + stub = kmalloc(sizeof(*stub), GFP_KERNEL); + if (!stub) + return -1; - return -1; + stub->addr = tramp; + stub->target = target; + hash_add(ppc_ftrace_stubs, >hentry, target); + + return 0; } /* @@ -328,16 +322,14 @@ static int add_ftrace_tramp(unsigned long tramp) */ static int setup_mcount_compiler_tramp(unsigned long tramp) { - int i; struct ppc_inst op; - unsigned long ptr; struct ppc_inst instr; + struct ppc_ftrace_stub_data *stub; + unsigned long ptr, ftrace_target = ppc_global_function_entry((void *)FTRACE_REGS_ADDR); /* Is this a known long jump tramp? */ - for (i = 0; i < NUM_FTRACE_TRAMPS; i++) - if (!ftrace_tramps[i]) - break; - else if (ftrace_tramps[i] == tramp) + hash_for_each_possible(ppc_ftrace_stubs, stub, hentry, ftrace_target) + if (stub->target == ftrace_target && stub->addr == tramp) return 0; /* New trampoline -- read where this goes */ @@ -361,19 +353,18 @@ static int setup_mcount_compiler_tramp(unsigned long tramp) } /* Let's re-write the tramp to go to ftrace_[regs_]caller */ - ptr = ppc_global_function_entry((void *)FTRACE_REGS_ADDR); - if (create_branch(, (void *)tramp, ptr, 0)) { + if (create_branch(, (void *)tramp, ftrace_target, 0)) { pr_debug("%ps is not reachable from existing mcount tramp\n", - (void *)ptr); + (void *)ftrace_target); return -1; } - if (patch_branch((struct ppc_inst *)tramp, ptr, 0)) { + if (patch_branch((struct ppc_inst *)tramp, ftrace_target, 0)) { pr_debug("REL24 out of range!\n"); return -1; } - if (add_ftrace_tramp(tramp)) { + if (add_ftrace_tramp(tramp, ftrace_target)) { pr_debug("No tramp locations left\n");
[RFC PATCH 11/14] powerpc/ftrace: Use GPR save/restore macros in ftrace_graph_caller()
Use SAVE_8GPRS(), REST_8GPRS() and _NIP(), along with using the standard SWITCH_FRAME_SIZE for the stack frame in ftrace_graph_caller() to simplify code. This increases the stack frame size, but it is unlikely to be an issue since ftrace_[regs_]caller() have just used a similar stack frame size, and it isn't evident that the graph caller has too deep a call stack to cause issues. Signed-off-by: Naveen N. Rao --- .../powerpc/kernel/trace/ftrace_64_mprofile.S | 28 +-- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S index f9fd5f743eba34..bbe871b47ade58 100644 --- a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S +++ b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S @@ -279,24 +279,17 @@ livepatch_handler: #ifdef CONFIG_FUNCTION_GRAPH_TRACER _GLOBAL(ftrace_graph_caller) - stdur1, -112(r1) + stdur1,-SWITCH_FRAME_SIZE(r1) /* with -mprofile-kernel, parameter regs are still alive at _mcount */ - std r10, 104(r1) - std r9, 96(r1) - std r8, 88(r1) - std r7, 80(r1) - std r6, 72(r1) - std r5, 64(r1) - std r4, 56(r1) - std r3, 48(r1) + SAVE_8GPRS(3, r1) /* Save callee's TOC in the ABI compliant location */ std r2, 24(r1) ld r2, PACATOC(r13)/* get kernel TOC in r2 */ - addir5, r1, 112 + addir5, r1, SWITCH_FRAME_SIZE mfctr r4 /* ftrace_caller has moved local addr here */ - std r4, 40(r1) + std r4, _NIP(r1) mflrr3 /* ftrace_caller has restored LR from stack */ subir4, r4, MCOUNT_INSN_SIZE @@ -309,21 +302,14 @@ _GLOBAL(ftrace_graph_caller) */ mtlrr3 - ld r0, 40(r1) + ld r0, _NIP(r1) mtctr r0 - ld r10, 104(r1) - ld r9, 96(r1) - ld r8, 88(r1) - ld r7, 80(r1) - ld r6, 72(r1) - ld r5, 64(r1) - ld r4, 56(r1) - ld r3, 48(r1) + REST_8GPRS(3, r1) /* Restore callee's TOC */ ld r2, 24(r1) - addir1, r1, 112 + addir1, r1, SWITCH_FRAME_SIZE mflrr0 std r0, LRSAVE(r1) bctr -- 2.25.4
[RFC PATCH 14/14] samples/ftrace: Add powerpc support for ftrace direct samples
Add a simple powerpc trampoline to demonstrate use of ftrace direct on powerpc. Signed-off-by: Naveen N. Rao --- samples/Kconfig | 2 +- samples/ftrace/ftrace-direct-modify.c | 58 +++ samples/ftrace/ftrace-direct-too.c| 48 -- samples/ftrace/ftrace-direct.c| 45 +++-- 4 files changed, 144 insertions(+), 9 deletions(-) diff --git a/samples/Kconfig b/samples/Kconfig index 0ed6e4d71d87b1..fdc9e44dba3b95 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -26,7 +26,7 @@ config SAMPLE_TRACE_PRINTK config SAMPLE_FTRACE_DIRECT tristate "Build register_ftrace_direct() example" depends on DYNAMIC_FTRACE_WITH_DIRECT_CALLS && m - depends on X86_64 # has x86_64 inlined asm + depends on X86_64 || PPC64 # has inlined asm help This builds an ftrace direct function example that hooks to wake_up_process and prints the parameters. diff --git a/samples/ftrace/ftrace-direct-modify.c b/samples/ftrace/ftrace-direct-modify.c index c13a5bc5095bea..89d66a12d300e1 100644 --- a/samples/ftrace/ftrace-direct-modify.c +++ b/samples/ftrace/ftrace-direct-modify.c @@ -2,6 +2,7 @@ #include #include #include +#include void my_direct_func1(void) { @@ -18,6 +19,7 @@ extern void my_tramp2(void *); static unsigned long my_ip = (unsigned long)schedule; +#ifdef CONFIG_X86 asm ( " .pushsection.text, \"ax\", @progbits\n" " .type my_tramp1, @function\n" @@ -38,6 +40,58 @@ asm ( " .size my_tramp2, .-my_tramp2\n" " .popsection\n" ); +#elif CONFIG_PPC64 +asm ( +" .pushsection.text, \"ax\", @progbits\n" +" .type my_tramp1, @function\n" +" .global my_tramp1\n" +" my_tramp1:\n" +" std 0, 16(1)\n" +" stdu1, -480(1)\n" +" std 2, 24(1)\n" +" mflr7\n" +" std 7, 368(1)\n" +" bcl 20, 31, 1f\n" +"1:mflr12\n" +" ld 2, (3f - 1b)(12)\n" +" bl my_direct_func1\n" +" nop\n" +" ld 2, 24(1)\n" +" ld 7, 368(1)\n" +" mtctr 7\n" +" addi1, 1, 480\n" +" ld 0, 16(1)\n" +" mtlr0\n" +" bctr\n" +" .size my_tramp1, .-my_tramp1\n" +" nop\n" +" .type my_tramp2, @function\n" +" .global my_tramp2\n" +" my_tramp2:\n" +" std 0, 16(1)\n" +" stdu1, -480(1)\n" +" std 2, 24(1)\n" +" mflr7\n" +" std 7, 368(1)\n" +" bcl 20, 31, 2f\n" +"2:mflr12\n" +" ld 2, (3f - 2b)(12)\n" +" bl my_direct_func2\n" +" nop\n" +" ld 2, 24(1)\n" +" ld 7, 368(1)\n" +" mtctr 7\n" +" addi1, 1, 480\n" +" ld 0, 16(1)\n" +" mtlr0\n" +" bctr\n" +" .size my_tramp2, .-my_tramp2\n" +"3:\n" +" .quad .TOC.@tocbase\n" +" .popsection\n" +); +#endif + static unsigned long my_tramp = (unsigned long)my_tramp1; static unsigned long tramps[2] = { @@ -72,6 +126,10 @@ static int __init ftrace_direct_init(void) { int ret; +#ifdef CONFIG_PPC64 + my_ip = ppc_function_entry((void *)my_ip) + 4; +#endif + ret = register_ftrace_direct(my_ip, my_tramp); if (!ret) simple_tsk = kthread_run(simple_thread, NULL, "event-sample-fn"); diff --git a/samples/ftrace/ftrace-direct-too.c b/samples/ftrace/ftrace-direct-too.c index d5c5022be66429..9a82abecbe0dcc 100644 --- a/samples/ftrace/ftrace-direct-too.c +++ b/samples/ftrace/ftrace-direct-too.c @@ -3,6 +3,7 @@ #include /* for handle_mm_fault() */ #include +#include void my_direct_func(struct vm_area_struct *vma, unsigned long address, unsigned int flags) @@ -13,6 +14,9 @@ void my_direct_func(struct vm_area_struct *vma, extern void my_tramp(void *); +static unsigned long my_ip = (unsigned long)handle_mm_fault; + +#ifdef CONFIG_X86 asm ( " .pushsection.text, \"ax\", @progbits\n" " .type my_tramp, @function\n" @@ -31,18 +35,54 @@ asm ( " .size my_tramp, .-my_tramp\n" " .popsection\n" ); +#elif CONFIG_PPC64 +asm ( +" .pushsection.text, \"ax\", @progbits\n" +" .type my_tramp, @function\n
[RFC PATCH 10/14] powerpc/ftrace: Drop assumptions about ftrace trampoline target
We currently assume that ftrace locations are patched to go to either ftrace_caller or ftrace_regs_caller. Drop this assumption in preparation for supporting ftrace direct calls. Signed-off-by: Naveen N. Rao --- arch/powerpc/kernel/trace/ftrace.c | 107 +++-- 1 file changed, 86 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index 7ddb6e4b527c39..fcb21a9756e456 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -322,14 +322,15 @@ static int add_ftrace_tramp(unsigned long tramp, unsigned long target) */ static int setup_mcount_compiler_tramp(unsigned long tramp) { + int i; struct ppc_inst op; struct ppc_inst instr; struct ppc_ftrace_stub_data *stub; unsigned long ptr, ftrace_target = ppc_global_function_entry((void *)FTRACE_REGS_ADDR); - /* Is this a known long jump tramp? */ - hash_for_each_possible(ppc_ftrace_stubs, stub, hentry, ftrace_target) - if (stub->target == ftrace_target && stub->addr == tramp) + /* Is this a known tramp? */ + hash_for_each(ppc_ftrace_stubs, i, stub, hentry) + if (stub->addr == tramp) return 0; /* New trampoline -- read where this goes */ @@ -608,23 +609,16 @@ static int __ftrace_make_call_kernel(struct dyn_ftrace *rec, unsigned long addr) { struct ppc_inst op; void *ip = (void *)rec->ip; - unsigned long tramp, entry, ptr; + unsigned long tramp, ptr; - /* Make sure we're being asked to patch branch to a known ftrace addr */ - entry = ppc_global_function_entry((void *)ftrace_caller); ptr = ppc_global_function_entry((void *)addr); - if (ptr != entry) { #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS - entry = ppc_global_function_entry((void *)ftrace_regs_caller); - if (ptr != entry) { + /* Make sure we branch to ftrace_regs_caller since we only setup stubs for that */ + tramp = ppc_global_function_entry((void *)ftrace_caller); + if (ptr == tramp) + ptr = ppc_global_function_entry((void *)FTRACE_REGS_ADDR); #endif - pr_err("Unknown ftrace addr to patch: %ps\n", (void *)ptr); - return -EINVAL; -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS - } -#endif - } /* Make sure we have a nop */ if (probe_kernel_read_inst(, ip)) { @@ -637,7 +631,7 @@ static int __ftrace_make_call_kernel(struct dyn_ftrace *rec, unsigned long addr) return -EINVAL; } - tramp = find_ftrace_tramp((unsigned long)ip, FTRACE_REGS_ADDR); + tramp = find_ftrace_tramp((unsigned long)ip, ptr); if (!tramp) { pr_err("No ftrace trampolines reachable from %ps\n", ip); return -EINVAL; @@ -783,6 +777,81 @@ __ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, } #endif +static int +__ftrace_modify_call_kernel(struct dyn_ftrace *rec, unsigned long old_addr, unsigned long addr) +{ + struct ppc_inst op; + unsigned long ip = rec->ip; + unsigned long entry, ptr, tramp; + + /* read where this goes */ + if (probe_kernel_read_inst(, (void *)ip)) { + pr_err("Fetching opcode failed.\n"); + return -EFAULT; + } + + /* Make sure that this is still a 24bit jump */ + if (!is_bl_op(op)) { + pr_err("Not expected bl: opcode is %s\n", ppc_inst_as_str(op)); + return -EINVAL; + } + + /* lets find where the pointer goes */ + tramp = find_bl_target(ip, op); + entry = ppc_global_function_entry((void *)old_addr); + + pr_devel("ip:%lx jumps to %lx", ip, tramp); + + if (tramp != entry) { + /* old_addr is not within range, so we must have used a trampoline */ + struct ppc_ftrace_stub_data *stub; + + hash_for_each_possible(ppc_ftrace_stubs, stub, hentry, entry) + if (stub->target == entry && stub->addr == tramp) + break; + + if (stub->target != entry || stub->addr != tramp) { + pr_err("we don't know about the tramp at %lx!\n", tramp); + return -EFAULT; + } + } + + /* The new target may be within range */ + if (test_24bit_addr(ip, addr)) { + /* within range */ + if (patch_branch((struct ppc_inst *)ip, addr, BRANCH_SET_LINK)) { + pr_err("REL24 out of range!\n"); + return -EINVAL; + } + + return 0; + } + + ptr = ppc_global_function_entry((void *)addr); + +#ifd
[RFC PATCH 13/14] powerpc/ftrace: Add support for register_ftrace_direct() for MPROFILE_KERNEL
Add support for register_ftrace_direct() for MPROFILE_KERNEL, as it depends on DYNAMIC_FTRACE_WITH_REGS. Since powerpc only provides a branch range of 32MB, we set aside a 64k area within kernel text for creating stubs that can be used to branch to the provided trampoline, which can be located in the module area. This is limited to kernel text, and as such, ftrace direct calls are not supported for functions in kernel modules at this time. We use orig_gpr3 to stash the address of the direct call trampoline in arch_ftrace_set_direct_caller(). ftrace_regs_caller() is updated to check for this to determine if we need to redirect to a direct call trampoline. As the direct call trampoline has to work as an alternative for the ftrace trampoline, we setup LR and r0 appropriately, and update ctr to the trampoline address. Finally, ftrace_graph_caller() is updated to save/restore r0. Signed-off-by: Naveen N. Rao --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/ftrace.h | 14 ++ arch/powerpc/kernel/trace/ftrace.c| 140 +- .../powerpc/kernel/trace/ftrace_64_mprofile.S | 40 - 4 files changed, 182 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index cfc6dd787f532c..a87ac2e403196e 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -197,6 +197,7 @@ config PPC select HAVE_DEBUG_KMEMLEAK select HAVE_DEBUG_STACKOVERFLOW select HAVE_DYNAMIC_FTRACE + select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLSif MPROFILE_KERNEL select HAVE_DYNAMIC_FTRACE_WITH_REGSif MPROFILE_KERNEL select HAVE_EBPF_JITif PPC64 select HAVE_EFFICIENT_UNALIGNED_ACCESS if !(CPU_LITTLE_ENDIAN && POWER7_CPU) diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index bc76970b6ee532..2f1c46e9f5d416 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -10,6 +10,8 @@ #define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR +#define FTRACE_STUBS_SIZE 65536 + #ifdef __ASSEMBLY__ /* Based off of objdump optput from glibc */ @@ -59,6 +61,18 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) struct dyn_arch_ftrace { struct module *mod; }; + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +/* + * When there is a direct caller registered, we use regs->orig_gpr3 (similar to + * how x86 uses orig_ax) to let ftrace_{regs_}_caller know that we should go + * there instead of returning to the function + */ +static inline void arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned long addr) +{ + regs->orig_gpr3 = addr; +} +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ #endif /* __ASSEMBLY__ */ #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index fcb21a9756e456..815b14ae45a71f 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -37,6 +37,7 @@ static DEFINE_HASHTABLE(ppc_ftrace_stubs, 8); struct ppc_ftrace_stub_data { unsigned long addr; unsigned long target; + refcount_t refs; struct hlist_node hentry; }; @@ -299,7 +300,7 @@ static unsigned long find_ftrace_tramp(unsigned long ip, unsigned long target) return 0; } -static int add_ftrace_tramp(unsigned long tramp, unsigned long target) +static int add_ftrace_tramp(unsigned long tramp, unsigned long target, int lock) { struct ppc_ftrace_stub_data *stub; @@ -309,11 +310,123 @@ static int add_ftrace_tramp(unsigned long tramp, unsigned long target) stub->addr = tramp; stub->target = target; + refcount_set(>refs, 1); + if (lock) + refcount_inc(>refs); hash_add(ppc_ftrace_stubs, >hentry, target); return 0; } +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +static u32 ftrace_direct_stub_insns[] = { + PPC_RAW_LIS(12, 0), + PPC_RAW_ORI(12, 12, 0), + PPC_RAW_SLDI(12, 12, 32), + PPC_RAW_ORIS(12, 12, 0), + PPC_RAW_ORI(12, 12, 0), + PPC_RAW_MTCTR(12), + PPC_RAW_BCTR(), +}; +#define FTRACE_NUM_STUBS (FTRACE_STUBS_SIZE / sizeof(ftrace_direct_stub_insns)) +static DECLARE_BITMAP(stubs_bitmap, FTRACE_NUM_STUBS); +extern unsigned int ftrace_stubs[]; + +static unsigned long get_ftrace_tramp(unsigned long ip, unsigned long target) +{ + struct ppc_ftrace_stub_data *stub_data; + struct ppc_inst instr; + unsigned int *stub; + int index; + + hash_for_each_possible(ppc_ftrace_stubs, stub_data, hentry, target) { + if (stub_data->target == target && + !create_branch(, (void *)ip, stub_data->addr, 0)) { + refcount_inc(_data->refs); + return stub_data->addr; +
[RFC PATCH 12/14] powerpc/ftrace: Drop saving LR to stack save area for -mprofile-kernel
In -mprofile-kernel variant of ftrace_graph_caller(), we save the optionally-updated LR address into the stack save area at the end. This is likely an offshoot of the initial -mprofile-kernel implementation in gcc emitting the same as part of the -mprofile-kernel instruction sequence. However, this is not required. Drop it. Signed-off-by: Naveen N. Rao --- arch/powerpc/kernel/trace/ftrace_64_mprofile.S | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S index bbe871b47ade58..c5602e9b07faa3 100644 --- a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S +++ b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S @@ -310,7 +310,5 @@ _GLOBAL(ftrace_graph_caller) ld r2, 24(r1) addir1, r1, SWITCH_FRAME_SIZE - mflrr0 - std r0, LRSAVE(r1) bctr #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -- 2.25.4
[RFC PATCH 08/14] powerpc/ftrace: Use FTRACE_REGS_ADDR to identify the correct ftrace trampoline
Use FTRACE_REGS_ADDR instead of keying off CONFIG_DYNAMIC_FTRACE_WITH_REGS to identify the proper ftrace trampoline address to use. Signed-off-by: Naveen N. Rao --- arch/powerpc/kernel/trace/ftrace.c | 12 ++-- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index 4fe5f373172fd2..14b39f7797d455 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -361,11 +361,7 @@ static int setup_mcount_compiler_tramp(unsigned long tramp) } /* Let's re-write the tramp to go to ftrace_[regs_]caller */ -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS - ptr = ppc_global_function_entry((void *)ftrace_regs_caller); -#else - ptr = ppc_global_function_entry((void *)ftrace_caller); -#endif + ptr = ppc_global_function_entry((void *)FTRACE_REGS_ADDR); if (create_branch(, (void *)tramp, ptr, 0)) { pr_debug("%ps is not reachable from existing mcount tramp\n", (void *)ptr); @@ -885,11 +881,7 @@ int __init ftrace_dyn_arch_init(void) 0x7d8903a6, /* mtctr r12 */ 0x4e800420, /* bctr */ }; -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS - unsigned long addr = ppc_global_function_entry((void *)ftrace_regs_caller); -#else - unsigned long addr = ppc_global_function_entry((void *)ftrace_caller); -#endif + unsigned long addr = ppc_global_function_entry((void *)FTRACE_REGS_ADDR); long reladdr = addr - kernel_toc_addr(); if (reladdr > 0x7FFF || reladdr < -(0x8000L)) { -- 2.25.4
[RFC PATCH 07/14] powerpc/ftrace: Remove dead code
ftrace_plt_tramps[] was intended to speed up skipping plt branches, but the code wasn't completed. It is also not significantly better than reading and decoding the instruction. Remove the same. Signed-off-by: Naveen N. Rao --- arch/powerpc/kernel/trace/ftrace.c | 8 1 file changed, 8 deletions(-) diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index 42761ebec9f755..4fe5f373172fd2 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -332,7 +332,6 @@ static int setup_mcount_compiler_tramp(unsigned long tramp) struct ppc_inst op; unsigned long ptr; struct ppc_inst instr; - static unsigned long ftrace_plt_tramps[NUM_FTRACE_TRAMPS]; /* Is this a known long jump tramp? */ for (i = 0; i < NUM_FTRACE_TRAMPS; i++) @@ -341,13 +340,6 @@ static int setup_mcount_compiler_tramp(unsigned long tramp) else if (ftrace_tramps[i] == tramp) return 0; - /* Is this a known plt tramp? */ - for (i = 0; i < NUM_FTRACE_TRAMPS; i++) - if (!ftrace_plt_tramps[i]) - break; - else if (ftrace_plt_tramps[i] == tramp) - return -1; - /* New trampoline -- read where this goes */ if (probe_kernel_read_inst(, (void *)tramp)) { pr_debug("Fetching opcode failed.\n"); -- 2.25.4
[RFC PATCH 06/14] powerpc: Add support for CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
Add register_get_kernel_argument() for a rudimentary way to access kernel function arguments. Signed-off-by: Naveen N. Rao --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/ptrace.h | 31 +++ 2 files changed, 32 insertions(+) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index e9f13fe084929b..cfc6dd787f532c 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -202,6 +202,7 @@ config PPC select HAVE_EFFICIENT_UNALIGNED_ACCESS if !(CPU_LITTLE_ENDIAN && POWER7_CPU) select HAVE_FAST_GUP select HAVE_FTRACE_MCOUNT_RECORD + select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_FUNCTION_ERROR_INJECTION select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index e2c778c176a3a6..956828c07abd70 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -62,6 +62,8 @@ struct pt_regs }; #endif +#define NR_REG_ARGUMENTS 8 + #ifdef __powerpc64__ /* @@ -85,8 +87,10 @@ struct pt_regs #ifdef PPC64_ELF_ABI_v2 #define STACK_FRAME_MIN_SIZE 32 +#define STACK_FRAME_PARM_SAVE 32 #else #define STACK_FRAME_MIN_SIZE STACK_FRAME_OVERHEAD +#define STACK_FRAME_PARM_SAVE 48 #endif /* Size of dummy stack frame allocated when calling signal handler. */ @@ -103,6 +107,7 @@ struct pt_regs #define STACK_INT_FRAME_SIZE (sizeof(struct pt_regs) + STACK_FRAME_OVERHEAD) #define STACK_FRAME_MARKER 2 #define STACK_FRAME_MIN_SIZE STACK_FRAME_OVERHEAD +#define STACK_FRAME_PARM_SAVE 8 /* Size of stack frame allocated when calling signal handler. */ #define __SIGNAL_FRAMESIZE 64 @@ -309,6 +314,32 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, return 0; } +/** + * regs_get_kernel_argument() - get Nth function argument in kernel + * @regs: pt_regs of that context + * @n: function argument number (start from 0) + * + * regs_get_argument() returns @n th argument of the function call. + * Note that this chooses most probable assignment, and is incorrect + * in scenarios where double or fp/vector parameters are involved. + * This also doesn't take into account stack alignment requirements. + * + * This is expected to be called from kprobes or ftrace with regs + * at function entry, so the current function has not setup its stack. + */ +static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs, +unsigned int n) +{ + if (n >= NR_REG_ARGUMENTS) { +#ifndef __powerpc64__ + n -= NR_REG_ARGUMENTS; +#endif + n += STACK_FRAME_PARM_SAVE / sizeof(unsigned long); + return regs_get_kernel_stack_nth(regs, n); + } else { + return regs_get_register(regs, offsetof(struct pt_regs, gpr[n + 3])); + } +} #endif /* __ASSEMBLY__ */ #ifndef __powerpc64__ -- 2.25.4
[RFC PATCH 03/14] ftrace: Fix cleanup in error path of register_ftrace_direct()
We need to remove hash entry if register_ftrace_function() fails. Consolidate the cleanup to be done after register_ftrace_function() at the end. Signed-off-by: Naveen N. Rao --- kernel/trace/ftrace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9c1bba8cc51b03..3844a4a1346a9c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5136,8 +5136,6 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) __add_hash_entry(direct_functions, entry); ret = ftrace_set_filter_ip(_ops, ip, 0, 0); - if (ret) - remove_hash_entry(direct_functions, entry); if (!ret && !(direct_ops.flags & FTRACE_OPS_FL_ENABLED)) { ret = register_ftrace_function(_ops); @@ -5146,6 +5144,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) } if (ret) { + remove_hash_entry(direct_functions, entry); kfree(entry); if (!direct->count) { list_del_rcu(>next); -- 2.25.4
[RFC PATCH 02/14] ftrace: Fix DYNAMIC_FTRACE_WITH_DIRECT_CALLS dependency
DYNAMIC_FTRACE_WITH_DIRECT_CALLS should depend on DYNAMIC_FTRACE_WITH_REGS since we need ftrace_regs_caller(). Fixes: 763e34e74bb7d5c ("ftrace: Add register_ftrace_direct()") Signed-off-by: Naveen N. Rao --- kernel/trace/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a4020c0b4508c9..e1bf5228fb692a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -202,7 +202,7 @@ config DYNAMIC_FTRACE_WITH_REGS config DYNAMIC_FTRACE_WITH_DIRECT_CALLS def_bool y - depends on DYNAMIC_FTRACE + depends on DYNAMIC_FTRACE_WITH_REGS depends on HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS config FUNCTION_PROFILER -- 2.25.4
[RFC PATCH 05/14] ftrace: Add architectural helpers for [un]register_ftrace_direct()
Architectures may want to do some validation (such as to ensure that the trampoline code is reachable from the provided ftrace location) before accepting ftrace direct registration. Add helpers for the same. Signed-off-by: Naveen N. Rao --- include/linux/ftrace.h | 2 ++ kernel/trace/ftrace.c | 27 +-- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 46b4b7ee28c41f..3fdcb4c513bc2d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -290,6 +290,8 @@ int ftrace_modify_direct_caller(struct ftrace_func_entry *entry, unsigned long old_addr, unsigned long new_addr); unsigned long ftrace_find_rec_direct(unsigned long ip); +int arch_register_ftrace_direct(unsigned long ip, unsigned long addr); +void arch_unregister_ftrace_direct(unsigned long ip, unsigned long addr); #else # define ftrace_direct_func_count 0 static inline int register_ftrace_direct(unsigned long ip, unsigned long addr) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7476f2458b6d95..0e259b90527722 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5005,6 +5005,13 @@ ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, } #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +int __weak arch_register_ftrace_direct(unsigned long ip, unsigned long addr) +{ + return 0; +} + +void __weak arch_unregister_ftrace_direct(unsigned long ip, unsigned long addr) { } + /** * register_ftrace_direct - Call a custom trampoline directly * @ip: The address of the nop at the beginning of a function @@ -5028,6 +5035,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) struct ftrace_hash *free_hash = NULL; struct dyn_ftrace *rec; int ret = -EBUSY; + int arch_ret; mutex_lock(_mutex); @@ -5082,18 +5090,24 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) entry->direct = addr; __add_hash_entry(direct_functions, entry); - ret = ftrace_set_filter_ip(_ops, ip, 0, 0); + arch_ret = arch_register_ftrace_direct(ip, addr); - if (!ret && !(direct_ops.flags & FTRACE_OPS_FL_ENABLED)) { - ret = register_ftrace_function(_ops); - if (ret) - ftrace_set_filter_ip(_ops, ip, 1, 0); + if (!arch_ret) { + ret = ftrace_set_filter_ip(_ops, ip, 0, 0); + + if (!ret && !(direct_ops.flags & FTRACE_OPS_FL_ENABLED)) { + ret = register_ftrace_function(_ops); + if (ret) + ftrace_set_filter_ip(_ops, ip, 1, 0); + } } - if (ret) { + if (arch_ret || ret) { remove_hash_entry(direct_functions, entry); ftrace_direct_func_count--; kfree(entry); + if (!arch_ret) + arch_unregister_ftrace_direct(ip, addr); } out_unlock: mutex_unlock(_mutex); @@ -5155,6 +5169,7 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr) remove_hash_entry(direct_functions, entry); ftrace_direct_func_count--; kfree(entry); + arch_unregister_ftrace_direct(ip, addr); out_unlock: mutex_unlock(_mutex); -- 2.25.4
[RFC PATCH 04/14] ftrace: Remove ftrace_find_direct_func()
This is a revert of commit 013bf0da047481 ("ftrace: Add ftrace_find_direct_func()") ftrace_find_direct_func() was meant for use in the function graph tracer by architecture specific code. However, commit ff205766dbbee0 ("ftrace: Fix function_graph tracer interaction with BPF trampoline") disabled function graph tracer for direct calls leaving this without any users. In addition, modify_ftrace_direct() allowed redirecting the direct call to a different trampoline that was never registered through register_ftrace_direct(). This meant that ftrace_direct_funcs didn't capture all trampolines. Signed-off-by: Naveen N. Rao --- include/linux/ftrace.h | 5 --- kernel/trace/ftrace.c | 84 ++ 2 files changed, 4 insertions(+), 85 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 1bd3a0356ae478..46b4b7ee28c41f 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -285,7 +285,6 @@ extern int ftrace_direct_func_count; int register_ftrace_direct(unsigned long ip, unsigned long addr); int unregister_ftrace_direct(unsigned long ip, unsigned long addr); int modify_ftrace_direct(unsigned long ip, unsigned long old_addr, unsigned long new_addr); -struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr); int ftrace_modify_direct_caller(struct ftrace_func_entry *entry, struct dyn_ftrace *rec, unsigned long old_addr, @@ -306,10 +305,6 @@ static inline int modify_ftrace_direct(unsigned long ip, { return -ENOTSUPP; } -static inline struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr) -{ - return NULL; -} static inline int ftrace_modify_direct_caller(struct ftrace_func_entry *entry, struct dyn_ftrace *rec, unsigned long old_addr, diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3844a4a1346a9c..7476f2458b6d95 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5005,46 +5005,6 @@ ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, } #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS - -struct ftrace_direct_func { - struct list_headnext; - unsigned long addr; - int count; -}; - -static LIST_HEAD(ftrace_direct_funcs); - -/** - * ftrace_find_direct_func - test an address if it is a registered direct caller - * @addr: The address of a registered direct caller - * - * This searches to see if a ftrace direct caller has been registered - * at a specific address, and if so, it returns a descriptor for it. - * - * This can be used by architecture code to see if an address is - * a direct caller (trampoline) attached to a fentry/mcount location. - * This is useful for the function_graph tracer, as it may need to - * do adjustments if it traced a location that also has a direct - * trampoline attached to it. - */ -struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr) -{ - struct ftrace_direct_func *entry; - bool found = false; - - /* May be called by fgraph trampoline (protected by rcu tasks) */ - list_for_each_entry_rcu(entry, _direct_funcs, next) { - if (entry->addr == addr) { - found = true; - break; - } - } - if (found) - return entry; - - return NULL; -} - /** * register_ftrace_direct - Call a custom trampoline directly * @ip: The address of the nop at the beginning of a function @@ -5064,7 +5024,6 @@ struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr) */ int register_ftrace_direct(unsigned long ip, unsigned long addr) { - struct ftrace_direct_func *direct; struct ftrace_func_entry *entry; struct ftrace_hash *free_hash = NULL; struct dyn_ftrace *rec; @@ -5118,19 +5077,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) if (!entry) goto out_unlock; - direct = ftrace_find_direct_func(addr); - if (!direct) { - direct = kmalloc(sizeof(*direct), GFP_KERNEL); - if (!direct) { - kfree(entry); - goto out_unlock; - } - direct->addr = addr; - direct->count = 0; - list_add_rcu(>next, _direct_funcs); - ftrace_direct_func_count++; - } - + ftrace_direct_func_count++; entry->ip = ip; entry->direct = addr; __add_hash_entry(direct_functions, entry); @@ -5145,18 +5092,8 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) if (ret) { remove_hash_entry(direct_functions, entry); + ftrace_direct_func_count--; kfree(entr
[RFC PATCH 01/14] ftrace: Fix updating FTRACE_FL_TRAMP
On powerpc, kprobe-direct.tc triggered FTRACE_WARN_ON() in ftrace_get_addr_new() followed by the below message: Bad trampoline accounting at: 4222522f (wake_up_process+0xc/0x20) (f001) The set of steps leading to this involved: - modprobe ftrace-direct-too - enable_probe - modprobe ftrace-direct - rmmod ftrace-direct <-- trigger The problem turned out to be that we were not updating flags in the ftrace record properly. From the above message about the trampoline accounting being bad, it can be seen that the ftrace record still has FTRACE_FL_TRAMP set though ftrace-direct module is going away. This happens because we are checking if any ftrace_ops has the FTRACE_FL_TRAMP flag set _before_ updating the filter hash. The fix for this is to look for any _other_ ftrace_ops that also needs FTRACE_FL_TRAMP. Signed-off-by: Naveen N. Rao --- kernel/trace/ftrace.c | 22 +- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8185f7240095f4..9c1bba8cc51b03 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1629,6 +1629,8 @@ static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec) static struct ftrace_ops * ftrace_find_tramp_ops_any(struct dyn_ftrace *rec); static struct ftrace_ops * +ftrace_find_tramp_ops_any_other(struct dyn_ftrace *rec, struct ftrace_ops *op_exclude); +static struct ftrace_ops * ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *ops); static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, @@ -1778,7 +1780,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, * to it. */ if (ftrace_rec_count(rec) == 1 && - ftrace_find_tramp_ops_any(rec)) + ftrace_find_tramp_ops_any_other(rec, ops)) rec->flags |= FTRACE_FL_TRAMP; else rec->flags &= ~FTRACE_FL_TRAMP; @@ -2244,6 +2246,24 @@ ftrace_find_tramp_ops_any(struct dyn_ftrace *rec) return NULL; } +static struct ftrace_ops * +ftrace_find_tramp_ops_any_other(struct dyn_ftrace *rec, struct ftrace_ops *op_exclude) +{ + struct ftrace_ops *op; + unsigned long ip = rec->ip; + + do_for_each_ftrace_op(op, ftrace_ops_list) { + + if (op == op_exclude || !op->trampoline) + continue; + + if (hash_contains_ip(ip, op->func_hash)) + return op; + } while_for_each_ftrace_op(op); + + return NULL; +} + static struct ftrace_ops * ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *op) -- 2.25.4
[RFC PATCH 00/14] powerpc64: Add support for ftrace direct calls
This series adds support for DYNAMIC_FTRACE_WITH_DIRECT_CALLS for powerpc64le. This is mostly working fine for me, except for a soft lockup I see with the ftrace direct selftest. It happens when irqsoff tracer is being tested with the ftrace direct modules. This appears to be an existing upstream issue since I am able to reproduce the lockup without these patches. I will be looking into that to see if I can figure out the cause of those lockups. In the meantime, I would appreciate a review of these patches. - Naveen Naveen N. Rao (14): ftrace: Fix updating FTRACE_FL_TRAMP ftrace: Fix DYNAMIC_FTRACE_WITH_DIRECT_CALLS dependency ftrace: Fix cleanup in error path of register_ftrace_direct() ftrace: Remove ftrace_find_direct_func() ftrace: Add architectural helpers for [un]register_ftrace_direct() powerpc: Add support for CONFIG_HAVE_FUNCTION_ARG_ACCESS_API powerpc/ftrace: Remove dead code powerpc/ftrace: Use FTRACE_REGS_ADDR to identify the correct ftrace trampoline powerpc/ftrace: Use a hash table for tracking ftrace stubs powerpc/ftrace: Drop assumptions about ftrace trampoline target powerpc/ftrace: Use GPR save/restore macros in ftrace_graph_caller() powerpc/ftrace: Drop saving LR to stack save area for -mprofile-kernel powerpc/ftrace: Add support for register_ftrace_direct() for MPROFILE_KERNEL samples/ftrace: Add powerpc support for ftrace direct samples arch/powerpc/Kconfig | 2 + arch/powerpc/include/asm/ftrace.h | 14 + arch/powerpc/include/asm/ptrace.h | 31 ++ arch/powerpc/kernel/trace/ftrace.c| 314 +- .../powerpc/kernel/trace/ftrace_64_mprofile.S | 70 ++-- include/linux/ftrace.h| 7 +- kernel/trace/Kconfig | 2 +- kernel/trace/ftrace.c | 130 +++- samples/Kconfig | 2 +- samples/ftrace/ftrace-direct-modify.c | 58 samples/ftrace/ftrace-direct-too.c| 48 ++- samples/ftrace/ftrace-direct.c| 45 ++- 12 files changed, 519 insertions(+), 204 deletions(-) base-commit: 4c202167192a77481310a3cacae9f12618b92216 -- 2.25.4
Re: [PATCH 1/3 v5] ftrace: Have the callbacks receive a struct ftrace_regs instead of pt_regs
Hi Steven, Steven Rostedt wrote: From: "Steven Rostedt (VMware)" In preparation to have arguments of a function passed to callbacks attached to functions as default, change the default callback prototype to receive a struct ftrace_regs as the forth parameter instead of a pt_regs. For callbacks that set the FL_SAVE_REGS flag in their ftrace_ops flags, they will now need to get the pt_regs via a ftrace_get_regs() helper call. If this is called by a callback that their ftrace_ops did not have a FL_SAVE_REGS flag set, it that helper function will return NULL. This will allow the ftrace_regs to hold enough just to get the parameters and stack pointer, but without the worry that callbacks may have a pt_regs that is not completely filled. Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- arch/x86/kernel/kprobes/ftrace.c | 3 ++- fs/pstore/ftrace.c| 2 +- include/linux/ftrace.h| 16 ++-- include/linux/kprobes.h | 2 +- kernel/livepatch/patch.c | 3 ++- kernel/trace/ftrace.c | 27 +++ kernel/trace/trace_event_perf.c | 2 +- kernel/trace/trace_events.c | 2 +- kernel/trace/trace_functions.c| 9 - kernel/trace/trace_irqsoff.c | 2 +- kernel/trace/trace_sched_wakeup.c | 2 +- kernel/trace/trace_selftest.c | 20 +++- kernel/trace/trace_stack.c| 2 +- 13 files changed, 55 insertions(+), 37 deletions(-) diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 629abaf25681..be73350955e4 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -345,7 +345,7 @@ static inline void wait_for_kprobe_optimizer(void) { } #endif /* CONFIG_OPTPROBES */ #ifdef CONFIG_KPROBES_ON_FTRACE extern void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *regs); + struct ftrace_ops *ops, struct ftrace_regs *fregs); This breaks the build on non-x86 architectures that support KPROBES_ON_FTRACE. - Naveen
Re: [PATCH v3 8/9] perf mem: Return NULL for event 'ldst' on PowerPC
[+ Maddy] Leo Yan wrote: If user specifies event type "ldst", PowerPC's perf_mem_events__name() will wrongly return the store event name "cpu/mem-stores/". This patch changes to return NULL for the event "ldst" on PowerPC. Signed-off-by: Leo Yan --- tools/perf/arch/powerpc/util/mem-events.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/perf/arch/powerpc/util/mem-events.c b/tools/perf/arch/powerpc/util/mem-events.c index 07fb5e049488..90c5a0760685 100644 --- a/tools/perf/arch/powerpc/util/mem-events.c +++ b/tools/perf/arch/powerpc/util/mem-events.c @@ -7,6 +7,8 @@ char *perf_mem_events__name(int i) { if (i == PERF_MEM_EVENTS__LOAD) return (char *) "cpu/mem-loads/"; - - return (char *) "cpu/mem-stores/"; + else if (i == PERF_MEM_EVENTS__STORE) + return (char *) "cpu/mem-stores/"; + else + return NULL; } -- 2.17.1
Re: [PATCH v1] powerpc/process: Remove unnecessary #ifdef CONFIG_FUNCTION_GRAPH_TRACER
Christophe Leroy wrote: ftrace_graph_ret_addr() is always defined and returns 'ip' when CONFIG_FUNCTION GRAPH_TRACER is not set. So the #ifdef is not needed, remove it. Signed-off-by: Christophe Leroy --- arch/powerpc/kernel/process.c | 4 1 file changed, 4 deletions(-) LGTM. Acked-by: Naveen N. Rao - Naveen
Re: [PATCH] MAINTAINERS: adjust kprobes.rst entry to new location
Lukas Bulwahn wrote: Commit 2165b82fde82 ("docs: Move kprobes.rst from staging/ to trace/") moved kprobes.rst, but missed to adjust the MAINTAINERS entry. Hence, ./scripts/get_maintainer.pl --self-test=patterns complains: warning: no file matchesF:Documentation/staging/kprobes.rst Adjust the entry to the new file location. Signed-off-by: Lukas Bulwahn --- Naveen, Masami-san, please ack. Jonathan, please pick this minor non-urgent patch into docs-next. applies cleanly on next-20200724 Ah, sorry. Hadn't noticed this change from Mauro. Acked-by: Naveen N. Rao Thanks! - Naveen
[PATCH 2/3] docs: staging/kprobes.rst: Move references to a separate appendix
Kprobes references are currently listed right after kretprobes example, and appears to be part of the same section. Move this out to a separate appendix for clarity. Signed-off-by: Naveen N. Rao --- Documentation/staging/kprobes.rst | 14 +- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/Documentation/staging/kprobes.rst b/Documentation/staging/kprobes.rst index 620699f5df22..b757b6dfd3d4 100644 --- a/Documentation/staging/kprobes.rst +++ b/Documentation/staging/kprobes.rst @@ -20,6 +20,7 @@ Kernel Probes (Kprobes) 10. Deprecated Features Appendix A: The kprobes debugfs interface Appendix B: The kprobes sysctl interface + Appendix C: References Concepts: Kprobes and Return Probes = @@ -710,11 +711,6 @@ Kretprobes Example See samples/kprobes/kretprobe_example.c -For additional information on Kprobes, refer to the following URLs: - -- https://www.ibm.com/developerworks/library/l-kprobes/index.html -- https://www.kernel.org/doc/ols/2006/ols2006v2-pages-109-124.pdf - Deprecated Features === @@ -797,3 +793,11 @@ Note that this knob *changes* the optimized state. This means that optimized probes (marked [OPTIMIZED]) will be unoptimized ([OPTIMIZED] tag will be removed). If the knob is turned on, they will be optimized again. +References +== + +For additional information on Kprobes, refer to the following URLs: + +- https://www.ibm.com/developerworks/library/l-kprobes/index.html +- https://www.kernel.org/doc/ols/2006/ols2006v2-pages-109-124.pdf + -- 2.25.4
[PATCH 3/3] docs: Move kprobes.rst from staging/ to trace/
Kprobes contitutes a dynamic tracing technology and as such can be moved alongside documentation of other tracing technologies. Signed-off-by: Naveen N. Rao --- Documentation/staging/index.rst | 1 - Documentation/trace/index.rst| 1 + Documentation/{staging => trace}/kprobes.rst | 0 3 files changed, 1 insertion(+), 1 deletion(-) rename Documentation/{staging => trace}/kprobes.rst (100%) diff --git a/Documentation/staging/index.rst b/Documentation/staging/index.rst index 184e6aece0a7..abd0d18254d2 100644 --- a/Documentation/staging/index.rst +++ b/Documentation/staging/index.rst @@ -7,7 +7,6 @@ Unsorted Documentation :maxdepth: 2 crc32 - kprobes lzo remoteproc rpmsg diff --git a/Documentation/trace/index.rst b/Documentation/trace/index.rst index 7d83156c9ac1..879ebb9f647d 100644 --- a/Documentation/trace/index.rst +++ b/Documentation/trace/index.rst @@ -9,6 +9,7 @@ Linux Tracing Technologies tracepoint-analysis ftrace ftrace-uses + kprobes kprobetrace uprobetracer tracepoints diff --git a/Documentation/staging/kprobes.rst b/Documentation/trace/kprobes.rst similarity index 100% rename from Documentation/staging/kprobes.rst rename to Documentation/trace/kprobes.rst -- 2.25.4
[PATCH 0/3] docs: kprobes: Update URLs and move under trace/
This series updates some of the URLs in the kprobes document and moves the same under trace/ directory. - Naveen Naveen N. Rao (3): docs: staging/kprobes.rst: Update some of the references docs: staging/kprobes.rst: Move references to a separate appendix docs: Move kprobes.rst from staging/ to trace/ Documentation/staging/index.rst | 1 - Documentation/trace/index.rst| 1 + Documentation/{staging => trace}/kprobes.rst | 16 +--- 3 files changed, 10 insertions(+), 8 deletions(-) rename Documentation/{staging => trace}/kprobes.rst (99%) base-commit: f33d4075e512274c0c6edcd7602da2cf1d5a0341 -- 2.25.4
[PATCH 1/3] docs: staging/kprobes.rst: Update some of the references
Some of the kprobes references are not valid anymore. Update the URLs to point to their changed locations, where appropriate. Drop two URLs which do not exist anymore. Reported-by: Masami Hiramatsu Signed-off-by: Naveen N. Rao --- Documentation/staging/kprobes.rst | 6 ++ 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/Documentation/staging/kprobes.rst b/Documentation/staging/kprobes.rst index 8baab8832c5b..620699f5df22 100644 --- a/Documentation/staging/kprobes.rst +++ b/Documentation/staging/kprobes.rst @@ -712,10 +712,8 @@ See samples/kprobes/kretprobe_example.c For additional information on Kprobes, refer to the following URLs: -- http://www-106.ibm.com/developerworks/library/l-kprobes.html?ca=dgr-lnxw42Kprobe -- http://www.redhat.com/magazine/005mar05/features/kprobes/ -- http://www-users.cs.umn.edu/~boutcher/kprobes/ -- http://www.linuxsymposium.org/2006/linuxsymposium_procv2.pdf (pages 101-115) +- https://www.ibm.com/developerworks/library/l-kprobes/index.html +- https://www.kernel.org/doc/ols/2006/ols2006v2-pages-109-124.pdf Deprecated Features === -- 2.25.4
Re: [PATCH] Replace HTTP links with HTTPS ones: kprobes
Masami Hiramatsu wrote: On Tue, 14 Jul 2020 00:02:49 +0200 "Alexander A. Klimov" wrote: Am 13.07.20 um 16:20 schrieb Masami Hiramatsu: > Hi Naveen and Alexander, > > On Fri, 10 Jul 2020 19:14:47 +0530 > "Naveen N. Rao" wrote: > >> Masami Hiramatsu wrote: >>> On Tue, 7 Jul 2020 21:49:59 +0200 >>> "Alexander A. Klimov" wrote: >>> >>>> Rationale: >>>> Reduces attack surface on kernel devs opening the links for MITM >>>> as HTTPS traffic is much harder to manipulate. >>>> >>>> Deterministic algorithm: >>>> For each file: >>>>If not .svg: >>>> For each line: >>>>If doesn't contain `\bxmlns\b`: >>>> For each link, `\bhttp://[^# \t\r\n]*(?:\w|/)`: >>>>If both the HTTP and HTTPS versions >>>>return 200 OK and serve the same content: >>>> Replace HTTP with HTTPS. >>> >>> OK, but it seems that some of them are disappeared :( >>> >>> http://www-106.ibm.com/developerworks/library/l-kprobes.html?ca=dgr-lnxw42Kprobe >>> >>> -> https://www.ibm.com/developerworks/library/l-kprobes/index.html >> >> That looks right. >> >>> >>> http://www.redhat.com/magazine/005mar05/features/kprobes/ >>> >>> -> I can not find that. >> >> Ditto, we should drop that. >> >>> >>>> - http://www-users.cs.umn.edu/~boutcher/kprobes/ >>>> - http://www.linuxsymposium.org/2006/linuxsymposium_procv2.pdf (pages 101-115) >>> >>> Both are not found. >> >> It looks like the first link is gone, but there seems to be a copy in >> the web archive: >> https://web.archive.org/web/20061106154519/http://www-users.cs.umn.edu/~boutcher/kprobes/ >> >> I suppose we can drop that link. >> >>> >>> (OT, it seems http://www.linuxsymposium.org/ has been left from historical >>> Linux Symposium, we must remove it asap) >> >> Indeed, I think that link pointed to the Kprobes paper: >> https://www.kernel.org/doc/ols/2006/ols2006v2-pages-109-124.pdf > > Ah, there is. > Thank you for the confirmation. > Alexander, can you update other urls instead of just replacing the http with https? Sry, but I don't steal others' work (on principle). If I didn't the work (e.g. searched the replacement URL), I don't deserve to author the respective commit. Alexander, Next time, please ask if you aren't sure about including others' suggestions -- no need to term it as "stealing". Masami asked if you can include this, and I shared what I thought are the correct URLs so that they can be included. We don't mind someone else doing this change. Besides, there are ways to acknowledge others, through a Suggested-by tag, as an example. Also my HTTPSifying task is not done yet. Hmm, Naveen, then, can you make the update? Sure, I will send a patch. - Naveen
Re: [PATCH] Replace HTTP links with HTTPS ones: kprobes
Masami Hiramatsu wrote: On Tue, 7 Jul 2020 21:49:59 +0200 "Alexander A. Klimov" wrote: Rationale: Reduces attack surface on kernel devs opening the links for MITM as HTTPS traffic is much harder to manipulate. Deterministic algorithm: For each file: If not .svg: For each line: If doesn't contain `\bxmlns\b`: For each link, `\bhttp://[^# \t\r\n]*(?:\w|/)`: If both the HTTP and HTTPS versions return 200 OK and serve the same content: Replace HTTP with HTTPS. OK, but it seems that some of them are disappeared :( http://www-106.ibm.com/developerworks/library/l-kprobes.html?ca=dgr-lnxw42Kprobe -> https://www.ibm.com/developerworks/library/l-kprobes/index.html That looks right. http://www.redhat.com/magazine/005mar05/features/kprobes/ -> I can not find that. Ditto, we should drop that. - http://www-users.cs.umn.edu/~boutcher/kprobes/ - http://www.linuxsymposium.org/2006/linuxsymposium_procv2.pdf (pages 101-115) Both are not found. It looks like the first link is gone, but there seems to be a copy in the web archive: https://web.archive.org/web/20061106154519/http://www-users.cs.umn.edu/~boutcher/kprobes/ I suppose we can drop that link. (OT, it seems http://www.linuxsymposium.org/ has been left from historical Linux Symposium, we must remove it asap) Indeed, I think that link pointed to the Kprobes paper: https://www.kernel.org/doc/ols/2006/ols2006v2-pages-109-124.pdf - Naveen
Re: [PATCH] perf powerpc: Don't ignore sym-handling.c file
Arnaldo Carvalho de Melo wrote: Em Mon, May 11, 2020 at 11:45:09PM +0530, Sandipan Das escreveu: On 09/05/20 4:51 pm, Ravi Bangoria wrote: > Commit 7eec00a74720 ("perf symbols: Consolidate symbol fixup issue") > removed powerpc specific sym-handling.c file from Build. This wasn't > caught by build CI because all functions in this file are declared > as __weak in common code. Fix it. > > Fixes: 7eec00a74720 ("perf symbols: Consolidate symbol fixup issue") > Reported-by: Sandipan Das > Signed-off-by: Ravi Bangoria > --- > tools/perf/arch/powerpc/util/Build | 1 + > 1 file changed, 1 insertion(+) > > diff --git a/tools/perf/arch/powerpc/util/Build b/tools/perf/arch/powerpc/util/Build > index e5c9504f8586..e86e210bf514 100644 > --- a/tools/perf/arch/powerpc/util/Build > +++ b/tools/perf/arch/powerpc/util/Build > @@ -2,6 +2,7 @@ perf-y += header.o > perf-y += kvm-stat.o > perf-y += perf_regs.o > perf-y += mem-events.o > +perf-y += sym-handling.o > > perf-$(CONFIG_DWARF) += dwarf-regs.o > perf-$(CONFIG_DWARF) += skip-callchain-idx.o > Thanks for fixing this! Acked-by: Sandipan Das Leo, Naveen, can you comment on this? Shoot -- this is a bad miss, I should have caught it. FWIW: Reviewed-by: Naveen N. Rao Thanks, Naveen
Re: [PATCH 12/14] docs: move remaining stuff under Documentation/*.txt to Documentation/staging
Masami Hiramatsu wrote: On Fri, 1 May 2020 17:37:56 +0200 Mauro Carvalho Chehab wrote: There are several files that I was unable to find a proper place for them, and 3 ones that are still in plain old text format. Let's place those stuff behind the carpet, as we'd like to keep the root directory clean. We can later discuss and move those into better places. Hi Mauro, Thanks for cleaning it up! Tentatively moving kprobes.txt under staging/ is good to me. Acked-by: Masami Hiramatsu BTW, I think kprobes.txt is under trace/ or we may be better making a new core-api/events/ directory and prepare other event systems (PMU, uprobes, and hw_breakpoint.) I think it would be good to move kprobes.txt under trace/ -- all other tracing bits are already present there, including uprobes. - Naveen
Re: [PATCH 0/2] pseries/hotplug: Change the default behaviour of cede_offline
Michael Ellerman wrote: "Naveen N. Rao" writes: Michael Ellerman wrote: "Gautham R. Shenoy" writes: From: "Gautham R. Shenoy" Currently on Pseries Linux Guests, the offlined CPU can be put to one of the following two states: - Long term processor cede (also called extended cede) - Returned to the Hypervisor via RTAS "stop-self" call. This is controlled by the kernel boot parameter "cede_offline=on/off". By default the offlined CPUs enter extended cede. Since commit 3aa565f53c39 ("powerpc/pseries: Add hooks to put the CPU into an appropriate offline state") (Nov 2009) Which you wrote :) Why was that wrong? The PHYP hypervisor considers CPUs in extended cede to be "active" since the CPUs are still under the control fo the Linux Guests. Hence, when we change the SMT modes by offlining the secondary CPUs, the PURR and the RWMR SPRs will continue to count the values for offlined CPUs in extended cede as if they are online. One of the expectations with PURR is that the for an interval of time, the sum of the PURR increments across the online CPUs of a core should equal the number of timebase ticks for that interval. This is currently not the case. But why does that matter? It's just some accounting stuff, does it actually break something meaningful? Yes, this broke lparstat at the very least (though its quite unfortunate we took so long to notice). By "so long" you mean 10 years? Also I've never heard of lparstat, but I assume it's one of these tools that's meant to behave like the AIX equivalent? Yes, and yes. lparstat is part of powerpc-utils. If it's been "broken" for 10 years and no one noticed, I'd argue the current behaviour is now "correct" and fixing it would actually be a breakage :) :) More on this below... With SMT disabled, and under load: $ sudo lparstat 1 10 System Configuration type=Shared mode=Uncapped smt=Off lcpu=2 mem=7759616 kB cpus=6 ent=1.00 %user %sys %wait%idlephysc %entc lbusy vcsw phint - - --- - - - - 100.00 0.00 0.00 0.00 1.10 110.00 100.00 128784460 0 100.00 0.00 0.00 0.00 1.07 107.00 100.00 128784860 0 100.00 0.00 0.00 0.00 1.07 107.00 100.00 128785260 0 100.00 0.00 0.00 0.00 1.07 107.00 100.00 128785662 0 100.00 0.00 0.00 0.00 1.07 107.00 100.00 128786062 0 100.00 0.00 0.00 0.00 1.07 107.00 100.00 128786462 0 100.00 0.00 0.00 0.00 1.07 107.00 100.00 128786862 0 100.00 0.00 0.00 0.00 1.07 107.00 100.00 128787262 0 100.00 0.00 0.00 0.00 1.07 107.00 100.00 128787664 0 100.00 0.00 0.00 0.00 1.07 107.00 100.00 128788064 0 What about that is wrong? The 'physc' column represents cpu usage in units of physical cores. With 2 virtual cores ('lcpu=2') in uncapped, shared processor mode, we expect this to be closer to 2 when fully loaded (and spare capacity elsewhere in the system). With cede_offline=off: $ sudo lparstat 1 10 System Configuration type=Shared mode=Uncapped smt=Off lcpu=2 mem=7759616 kB cpus=6 ent=1.00 %user %sys %wait%idlephysc %entc lbusy vcsw phint - - --- - - - - 100.00 0.00 0.00 0.00 1.94 194.00 100.00 128961588 0 100.00 0.00 0.00 0.00 1.91 191.00 100.00 128961988 0 100.00 0.00 0.00 0.00 inf inf 100.00 128962392 0 100.00 0.00 0.00 0.00 1.91 191.00 100.00 128962792 0 100.00 0.00 0.00 0.00 1.91 191.00 100.00 128963192 0 100.00 0.00 0.00 0.00 1.91 191.00 100.00 128963592 0 100.00 0.00 0.00 0.00 1.91 191.00 100.00 128963992 0 100.00 0.00 0.00 0.00 1.91 191.00 100.00 128964392 0 100.00 0.00 0.00 0.00 1.91 191.00 100.00 128964792 0 100.00 0.00 0.00 0.00 1.91 191.00 100.00 128965194 0 [The 'inf' values there show a different bug] Also, since we expose [S]PURR through sysfs, any tools that make use of that directly are also affected due to this. But again if we've had the current behaviour for 10 years then arguably that's now the correct behaviour. That's a fair point, and probably again points to this area getting less tested. One of the main reasons for this being caught now though, is that there are workloads being tested under lower SMT levels now. So, I suspect no one has been relying on this behavior and we can consider this to be a bug. Thanks, Naveen
Re: [PATCH 0/2] pseries/hotplug: Change the default behaviour of cede_offline
Michael Ellerman wrote: "Gautham R. Shenoy" writes: From: "Gautham R. Shenoy" Currently on Pseries Linux Guests, the offlined CPU can be put to one of the following two states: - Long term processor cede (also called extended cede) - Returned to the Hypervisor via RTAS "stop-self" call. This is controlled by the kernel boot parameter "cede_offline=on/off". By default the offlined CPUs enter extended cede. Since commit 3aa565f53c39 ("powerpc/pseries: Add hooks to put the CPU into an appropriate offline state") (Nov 2009) Which you wrote :) Why was that wrong? The PHYP hypervisor considers CPUs in extended cede to be "active" since the CPUs are still under the control fo the Linux Guests. Hence, when we change the SMT modes by offlining the secondary CPUs, the PURR and the RWMR SPRs will continue to count the values for offlined CPUs in extended cede as if they are online. One of the expectations with PURR is that the for an interval of time, the sum of the PURR increments across the online CPUs of a core should equal the number of timebase ticks for that interval. This is currently not the case. But why does that matter? It's just some accounting stuff, does it actually break something meaningful? Yes, this broke lparstat at the very least (though its quite unfortunate we took so long to notice). With SMT disabled, and under load: $ sudo lparstat 1 10 System Configuration type=Shared mode=Uncapped smt=Off lcpu=2 mem=7759616 kB cpus=6 ent=1.00 %user %sys %wait%idlephysc %entc lbusy vcsw phint - - --- - - - - 100.00 0.00 0.00 0.00 1.10 110.00 100.00 128784460 0 100.00 0.00 0.00 0.00 1.07 107.00 100.00 128784860 0 100.00 0.00 0.00 0.00 1.07 107.00 100.00 128785260 0 100.00 0.00 0.00 0.00 1.07 107.00 100.00 128785662 0 100.00 0.00 0.00 0.00 1.07 107.00 100.00 128786062 0 100.00 0.00 0.00 0.00 1.07 107.00 100.00 128786462 0 100.00 0.00 0.00 0.00 1.07 107.00 100.00 128786862 0 100.00 0.00 0.00 0.00 1.07 107.00 100.00 128787262 0 100.00 0.00 0.00 0.00 1.07 107.00 100.00 128787664 0 100.00 0.00 0.00 0.00 1.07 107.00 100.00 128788064 0 With cede_offline=off: $ sudo lparstat 1 10 System Configuration type=Shared mode=Uncapped smt=Off lcpu=2 mem=7759616 kB cpus=6 ent=1.00 %user %sys %wait%idlephysc %entc lbusy vcsw phint - - --- - - - - 100.00 0.00 0.00 0.00 1.94 194.00 100.00 128961588 0 100.00 0.00 0.00 0.00 1.91 191.00 100.00 128961988 0 100.00 0.00 0.00 0.00 inf inf 100.00 128962392 0 100.00 0.00 0.00 0.00 1.91 191.00 100.00 128962792 0 100.00 0.00 0.00 0.00 1.91 191.00 100.00 128963192 0 100.00 0.00 0.00 0.00 1.91 191.00 100.00 128963592 0 100.00 0.00 0.00 0.00 1.91 191.00 100.00 128963992 0 100.00 0.00 0.00 0.00 1.91 191.00 100.00 128964392 0 100.00 0.00 0.00 0.00 1.91 191.00 100.00 128964792 0 100.00 0.00 0.00 0.00 1.91 191.00 100.00 128965194 0 [The 'inf' values there show a different bug] Also, since we expose [S]PURR through sysfs, any tools that make use of that directly are also affected due to this. - Naveen
Re: [PATCH 2/2] powerpc/watchpoint: Disable watchpoint hit by larx/stcx instructions
Ravi Bangoria wrote: If watchpoint exception is generated by larx/stcx instructions, the reservation created by larx gets lost while handling exception, and thus stcx instruction always fails. Generally these instructions are used in a while(1) loop, for example spinlocks. And because stcx never succeeds, it loops forever and ultimately hangs the system. Note that ptrace anyway works in one-shot mode and thus for ptrace we don't change the behaviour. It's up to ptrace user to take care of this. Signed-off-by: Ravi Bangoria --- arch/powerpc/kernel/hw_breakpoint.c | 49 +++-- 1 file changed, 33 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index 28ad3171bb82..9fa496a598ce 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -195,14 +195,32 @@ void thread_change_pc(struct task_struct *tsk, struct pt_regs *regs) tsk->thread.last_hit_ubp = NULL; } +static bool is_larx_stcx_instr(struct pt_regs *regs, unsigned int instr) +{ + int ret, type; + struct instruction_op op; + + ret = analyse_instr(, regs, instr); + type = GETTYPE(op.type); + return (!ret && (type == LARX || type == STCX)); +} + /* * Handle debug exception notifications. */ static bool stepping_handler(struct pt_regs *regs, struct perf_event *bp, unsigned long addr) { - int stepped; - unsigned int instr; + unsigned int instr = 0; + + if (__get_user_inatomic(instr, (unsigned int *)regs->nip)) + goto fail; + + if (is_larx_stcx_instr(regs, instr)) { + printk_ratelimited("Watchpoint: Can't emulate/single-step larx/" + "stcx instructions. Disabling watchpoint.\n"); The below WARN() uses the term 'breakpoint'. Better to use consistent terminology. I would rewrite the above as: printk_ratelimited("Breakpoint hit on instruction that can't be emulated. " "Breakpoint at 0x%lx will be disabled.\n", addr); Otherwise: Acked-by: Naveen N. Rao - Naveen + goto disable; + } /* Do not emulate user-space instructions, instead single-step them */ if (user_mode(regs)) { @@ -211,23 +229,22 @@ static bool stepping_handler(struct pt_regs *regs, struct perf_event *bp, return false; } - stepped = 0; - instr = 0; - if (!__get_user_inatomic(instr, (unsigned int *)regs->nip)) - stepped = emulate_step(regs, instr); + if (!emulate_step(regs, instr)) + goto fail; + return true; + +fail: /* -* emulate_step() could not execute it. We've failed in reliably -* handling the hw-breakpoint. Unregister it and throw a warning -* message to let the user know about it. +* We've failed in reliably handling the hw-breakpoint. Unregister +* it and throw a warning message to let the user know about it. */ - if (!stepped) { - WARN(1, "Unable to handle hardware breakpoint. Breakpoint at " - "0x%lx will be disabled.", addr); - perf_event_disable_inatomic(bp); - return false; - } - return true; + WARN(1, "Unable to handle hardware breakpoint. Breakpoint at " + "0x%lx will be disabled.", addr); + +disable: + perf_event_disable_inatomic(bp); + return false; } int hw_breakpoint_handler(struct die_args *args) -- 2.21.0
[PATCH 3/3] powerpc: Use ftrace_graph_ret_addr() when unwinding
With support for HAVE_FUNCTION_GRAPH_RET_ADDR_PTR, ftrace_graph_ret_addr() provides more robust unwinding when function graph is in use. Update show_stack() to use the same. With dump_stack() added to sysrq_sysctl_handler(), before this patch: root@(none):/sys/kernel/debug/tracing# cat /proc/sys/kernel/sysrq CPU: 0 PID: 218 Comm: cat Not tainted 5.3.0-rc7-00868-g8453ad4a078c-dirty #20 Call Trace: [c000d1e13c30] [c006ab98] return_to_handler+0x0/0x40 (dump_stack+0xe8/0x164) (unreliable) [c000d1e13c80] [c0145680] sysrq_sysctl_handler+0x48/0xb8 [c000d1e13cd0] [c006ab98] return_to_handler+0x0/0x40 (proc_sys_call_handler+0x274/0x2a0) [c000d1e13d60] [c006ab98] return_to_handler+0x0/0x40 (return_to_handler+0x0/0x40) [c000d1e13d80] [c006ab98] return_to_handler+0x0/0x40 (__vfs_read+0x3c/0x70) [c000d1e13dd0] [c006ab98] return_to_handler+0x0/0x40 (vfs_read+0xb8/0x1b0) [c000d1e13e20] [c006ab98] return_to_handler+0x0/0x40 (ksys_read+0x7c/0x140) After this patch: Call Trace: [c000d1e33c30] [c006ab58] return_to_handler+0x0/0x40 (dump_stack+0xe8/0x164) (unreliable) [c000d1e33c80] [c0145680] sysrq_sysctl_handler+0x48/0xb8 [c000d1e33cd0] [c006ab58] return_to_handler+0x0/0x40 (proc_sys_call_handler+0x274/0x2a0) [c000d1e33d60] [c006ab58] return_to_handler+0x0/0x40 (__vfs_read+0x3c/0x70) [c000d1e33d80] [c006ab58] return_to_handler+0x0/0x40 (vfs_read+0xb8/0x1b0) [c000d1e33dd0] [c006ab58] return_to_handler+0x0/0x40 (ksys_read+0x7c/0x140) [c000d1e33e20] [c006ab58] return_to_handler+0x0/0x40 (system_call+0x5c/0x68) Signed-off-by: Naveen N. Rao --- arch/powerpc/kernel/process.c | 19 ++- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 24621e7e5033..f289bdd2b562 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -2047,10 +2047,8 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) int count = 0; int firstframe = 1; #ifdef CONFIG_FUNCTION_GRAPH_TRACER - struct ftrace_ret_stack *ret_stack; - extern void return_to_handler(void); - unsigned long rth = (unsigned long)return_to_handler; - int curr_frame = 0; + unsigned long ret_addr; + int ftrace_idx = 0; #endif if (tsk == NULL) @@ -2079,15 +2077,10 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) if (!firstframe || ip != lr) { printk("["REG"] ["REG"] %pS", sp, ip, (void *)ip); #ifdef CONFIG_FUNCTION_GRAPH_TRACER - if ((ip == rth) && curr_frame >= 0) { - ret_stack = ftrace_graph_get_ret_stack(current, - curr_frame++); - if (ret_stack) - pr_cont(" (%pS)", - (void *)ret_stack->ret); - else - curr_frame = -1; - } + ret_addr = ftrace_graph_ret_addr(current, + _idx, ip, stack); + if (ret_addr != ip) + pr_cont(" (%pS)", (void *)ret_addr); #endif if (firstframe) pr_cont(" (unreliable)"); -- 2.23.0
[PATCH 2/3] powerpc/ftrace: Enable HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
This associates entries in the ftrace_ret_stack with corresponding stack frames, enabling more robust stack unwinding. Also update the only user of ftrace_graph_ret_addr() to pass the stack pointer. Signed-off-by: Naveen N. Rao --- arch/powerpc/include/asm/asm-prototypes.h | 3 ++- arch/powerpc/include/asm/ftrace.h | 2 ++ arch/powerpc/kernel/stacktrace.c | 2 +- arch/powerpc/kernel/trace/ftrace.c | 5 +++-- arch/powerpc/kernel/trace/ftrace_32.S | 1 + arch/powerpc/kernel/trace/ftrace_64_mprofile.S | 1 + arch/powerpc/kernel/trace/ftrace_64_pg.S | 1 + 7 files changed, 11 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index 49196d35e3bb..8561498e653c 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -134,7 +134,8 @@ extern int __ucmpdi2(u64, u64); /* tracing */ void _mcount(void); -unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip); +unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip, + unsigned long sp); void pnv_power9_force_smt4_catch(void); void pnv_power9_force_smt4_release(void); diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index 3dfb80b86561..f54a08a2cd70 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -8,6 +8,8 @@ #define MCOUNT_ADDR((unsigned long)(_mcount)) #define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ +#define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR + #ifdef __ASSEMBLY__ /* Based off of objdump optput from glibc */ diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c index 1e2276963f6d..e2a46cfed5fd 100644 --- a/arch/powerpc/kernel/stacktrace.c +++ b/arch/powerpc/kernel/stacktrace.c @@ -182,7 +182,7 @@ static int __save_stack_trace_tsk_reliable(struct task_struct *tsk, * FIXME: IMHO these tests do not belong in * arch-dependent code, they are generic. */ - ip = ftrace_graph_ret_addr(tsk, _idx, ip, NULL); + ip = ftrace_graph_ret_addr(tsk, _idx, ip, stack); #ifdef CONFIG_KPROBES /* * Mark stacktraces with kretprobed functions on them diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index be1ca98fce5c..7ea0ca044b65 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -944,7 +944,8 @@ int ftrace_disable_ftrace_graph_caller(void) * Hook the return address and push it in the stack of return addrs * in current thread info. Return the address we want to divert to. */ -unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip) +unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip, + unsigned long sp) { unsigned long return_hooker; @@ -956,7 +957,7 @@ unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip) return_hooker = ppc_function_entry(return_to_handler); - if (!function_graph_enter(parent, ip, 0, NULL)) + if (!function_graph_enter(parent, ip, 0, (unsigned long *)sp)) parent = return_hooker; out: return parent; diff --git a/arch/powerpc/kernel/trace/ftrace_32.S b/arch/powerpc/kernel/trace/ftrace_32.S index 183f608efb81..e023ae59c429 100644 --- a/arch/powerpc/kernel/trace/ftrace_32.S +++ b/arch/powerpc/kernel/trace/ftrace_32.S @@ -50,6 +50,7 @@ _GLOBAL(ftrace_stub) #ifdef CONFIG_FUNCTION_GRAPH_TRACER _GLOBAL(ftrace_graph_caller) + addir5, r1, 48 /* load r4 with local address */ lwz r4, 44(r1) subir4, r4, MCOUNT_INSN_SIZE diff --git a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S index 74acbf16a666..f9fd5f743eba 100644 --- a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S +++ b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S @@ -294,6 +294,7 @@ _GLOBAL(ftrace_graph_caller) std r2, 24(r1) ld r2, PACATOC(r13)/* get kernel TOC in r2 */ + addir5, r1, 112 mfctr r4 /* ftrace_caller has moved local addr here */ std r4, 40(r1) mflrr3 /* ftrace_caller has restored LR from stack */ diff --git a/arch/powerpc/kernel/trace/ftrace_64_pg.S b/arch/powerpc/kernel/trace/ftrace_64_pg.S index e41a7d13c99c..6708e24db0ab 100644 --- a/arch/powerpc/kernel/trace/ftrace_64_pg.S +++ b/arch/powerpc/kernel/trace/ftrace_64_pg.S @@ -41,6 +41,7 @@ _GLOBAL(ftrace_stub) #ifdef CONFIG_FUNCTION_GRAPH_TRACER _GLOBAL(ftrace_graph_caller) + addir5, r1, 112 /* load r4 with local address */ ld r4, 128(r1) subir4
[PATCH 0/3] powerpc/ftrace: Enable HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
Enable HAVE_FUNCTION_GRAPH_RET_ADDR_PTR for more robust stack unwinding when function graph tracer is in use. Convert powerpc show_stack() to use ftrace_graph_ret_addr() for better stack unwinding. - Naveen Naveen N. Rao (3): ftrace: Look up the address of return_to_handler() using helpers powerpc/ftrace: Enable HAVE_FUNCTION_GRAPH_RET_ADDR_PTR powerpc: Use ftrace_graph_ret_addr() when unwinding arch/powerpc/include/asm/asm-prototypes.h | 3 ++- arch/powerpc/include/asm/ftrace.h | 2 ++ arch/powerpc/kernel/process.c | 19 ++- arch/powerpc/kernel/stacktrace.c | 2 +- arch/powerpc/kernel/trace/ftrace.c| 5 +++-- arch/powerpc/kernel/trace/ftrace_32.S | 1 + .../powerpc/kernel/trace/ftrace_64_mprofile.S | 1 + arch/powerpc/kernel/trace/ftrace_64_pg.S | 1 + kernel/trace/fgraph.c | 4 ++-- 9 files changed, 19 insertions(+), 19 deletions(-) -- 2.23.0
[PATCH 1/3] ftrace: Look up the address of return_to_handler() using helpers
This ensures that we use the right address on architectures that use function descriptors. Signed-off-by: Naveen N. Rao --- kernel/trace/fgraph.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 8dfd5021b933..7950a0356042 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -276,7 +276,7 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, int index = task->curr_ret_stack; int i; - if (ret != (unsigned long)return_to_handler) + if (ret != (unsigned long)dereference_kernel_function_descriptor(return_to_handler)) return ret; if (index < 0) @@ -294,7 +294,7 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, { int task_idx; - if (ret != (unsigned long)return_to_handler) + if (ret != (unsigned long)dereference_kernel_function_descriptor(return_to_handler)) return ret; task_idx = task->curr_ret_stack; -- 2.23.0
Re: [PATCH v3 2/3] Powerpc64/Watchpoint: Don't ignore extraneous exceptions
Ravi Bangoria wrote: On Powerpc64, watchpoint match range is double-word granular. On a watchpoint hit, DAR is set to the first byte of overlap between actual access and watched range. And thus it's quite possible that DAR does not point inside user specified range. Ex, say user creates a watchpoint with address range 0x1004 to 0x1007. So hw would be configured to watch from 0x1000 to 0x1007. If there is a 4 byte access from 0x1002 to 0x1005, DAR will point to 0x1002 and thus interrupt handler considers it as extraneous, but it's actually not, because part of the access belongs to what user has asked. So, let kernel pass it on to user and let user decide what to do with it instead of silently ignoring it. The drawback is, it can generate false positive events. I think you should do the additional validation here, instead of generating false positives. You should be able to read the instruction, run it through analyse_instr(), and then use OP_IS_LOAD_STORE() and GETSIZE() to understand the access range. This can be used to then perform a better match against what the user asked for. - Naveen
Re: [PATCH -tip v2] kprobes: Prohibit probing on BUG() and WARN() address
Masami Hiramatsu wrote: Since BUG() and WARN() may use a trap (e.g. UD2 on x86) to get the address where the BUG() has occurred, kprobes can not do single-step out-of-line that instruction. So prohibit probing on such address. Without this fix, if someone put a kprobe on WARN(), the kernel will crash with invalid opcode error instead of outputing warning message, because kernel can not find correct bug address. Signed-off-by: Masami Hiramatsu --- Changes in v2: - Add find_bug() stub function for !CONFIG_GENERIC_BUG - Cast the p->addr to unsigned long. --- include/linux/bug.h |5 + kernel/kprobes.c|3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) Acked-by: Naveen N. Rao - Naveen
Re: [PATCH 1/2] ftrace: Fix NULL pointer dereference in t_probe_next()
Steven Rostedt wrote: On Thu, 4 Jul 2019 20:04:41 +0530 "Naveen N. Rao" wrote: kernel/trace/ftrace.c | 4 1 file changed, 4 insertions(+) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7b037295a1f1..0791eafb693d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3093,6 +3093,10 @@ t_probe_next(struct seq_file *m, loff_t *pos) hnd = >probe_entry->hlist; hash = iter->probe->ops.func_hash->filter_hash; + + if (!hash) + return NULL; + size = 1 << hash->size_bits; retry: OK, I added this, but I'm also adding this on top: Thanks, the additional comments do make this much clearer. Regards, Naveen
[tip: perf/core] perf arch powerpc: Sync powerpc syscall.tbl
The following commit has been merged into the perf/core branch of tip: Commit-ID: 0a56e0603fa13af08816d673f6f71b68cda2fb2e Gitweb: https://git.kernel.org/tip/0a56e0603fa13af08816d673f6f71b68cda2fb2e Author:Naveen N. Rao AuthorDate:Tue, 27 Aug 2019 12:44:58 +05:30 Committer: Arnaldo Carvalho de Melo CommitterDate: Wed, 28 Aug 2019 10:25:38 -03:00 perf arch powerpc: Sync powerpc syscall.tbl Copy over powerpc syscall.tbl to grab changes from the below commits: commit cee3536d24a1 ("powerpc: Wire up clone3 syscall") commit 1a271a68e030 ("arch: mark syscall number 435 reserved for clone3") commit 7615d9e1780e ("arch: wire-up pidfd_open()") commit d8076bdb56af ("uapi: Wire up the mount API syscalls on non-x86 arches [ver #2]") commit 39036cd27273 ("arch: add pidfd and io_uring syscalls everywhere") commit 48166e6ea47d ("y2038: add 64-bit time_t syscalls to all 32-bit architectures") commit d33c577cccd0 ("y2038: rename old time and utime syscalls") commit 00bf25d693e7 ("y2038: use time32 syscall names on 32-bit") commit 8dabe7245bbc ("y2038: syscalls: rename y2038 compat syscalls") commit 0d6040d46817 ("arch: add split IPC system calls where needed") Reported-by: Nicholas Piggin Signed-off-by: Naveen N. Rao Cc: Ravi Bangoria Cc: linuxppc-...@lists.ozlabs.org Link: http://lkml.kernel.org/r/20190827071458.19897-1-naveen.n@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/powerpc/entry/syscalls/syscall.tbl | 146 +--- 1 file changed, 119 insertions(+), 27 deletions(-) diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl index db3bbb8..43f736e 100644 --- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl @@ -20,7 +20,9 @@ 10 common unlink sys_unlink 11 nospu execve sys_execve compat_sys_execve 12 common chdir sys_chdir -13 common timesys_time compat_sys_time +13 32 timesys_time32 +13 64 timesys_time +13 spu timesys_time 14 common mknod sys_mknod 15 common chmod sys_chmod 16 common lchown sys_lchown @@ -36,14 +38,17 @@ 22 spu umount sys_ni_syscall 23 common setuid sys_setuid 24 common getuid sys_getuid -25 common stime sys_stime compat_sys_stime +25 32 stime sys_stime32 +25 64 stime sys_stime +25 spu stime sys_stime 26 nospu ptrace sys_ptrace compat_sys_ptrace 27 common alarm sys_alarm 28 32 oldfstatsys_fstat sys_ni_syscall 28 64 oldfstatsys_ni_syscall 28 spu oldfstatsys_ni_syscall 29 nospu pause sys_pause -30 nospu utime sys_utime compat_sys_utime +30 32 utime sys_utime32 +30 64 utime sys_utime 31 common sttysys_ni_syscall 32 common gttysys_ni_syscall 33 common access sys_access @@ -157,7 +162,9 @@ 121common setdomainname sys_setdomainname 122common uname sys_newuname 123common modify_ldt sys_ni_syscall -124common adjtimexsys_adjtimex compat_sys_adjtimex +12432 adjtimexsys_adjtimex_time32 +12464 adjtimexsys_adjtimex +124spu adjtimexsys_adjtimex 125common mprotectsys_mprotect 12632 sigprocmask sys_sigprocmask compat_sys_sigprocmask 12664 sigprocmask sys_ni_syscall @@ -198,8 +205,12 @@ 158common sched_yield sys_sched_yield 159common sched_get_priority_max sys_sched_get_priority_max 160common sched_get_priority_min sys_sched_get_priority_min -161common sched_rr_get_interval sys_sched_rr_get_interval
[PATCH] perf arch powerpc: Sync powerpc syscall.tbl
Copy over powerpc syscall.tbl to grab changes from the below commits: commit cee3536d24a1 ("powerpc: Wire up clone3 syscall") commit 1a271a68e030 ("arch: mark syscall number 435 reserved for clone3") commit 7615d9e1780e ("arch: wire-up pidfd_open()") commit d8076bdb56af ("uapi: Wire up the mount API syscalls on non-x86 arches [ver #2]") commit 39036cd27273 ("arch: add pidfd and io_uring syscalls everywhere") commit 48166e6ea47d ("y2038: add 64-bit time_t syscalls to all 32-bit architectures") commit d33c577cccd0 ("y2038: rename old time and utime syscalls") commit 00bf25d693e7 ("y2038: use time32 syscall names on 32-bit") commit 8dabe7245bbc ("y2038: syscalls: rename y2038 compat syscalls") commit 0d6040d46817 ("arch: add split IPC system calls where needed") Reported-by: Nicholas Piggin Signed-off-by: Naveen N. Rao --- .../arch/powerpc/entry/syscalls/syscall.tbl | 146 ++ 1 file changed, 119 insertions(+), 27 deletions(-) diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl index db3bbb8744af..43f736ed47f2 100644 --- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl @@ -20,7 +20,9 @@ 10 common unlink sys_unlink 11 nospu execve sys_execve compat_sys_execve 12 common chdir sys_chdir -13 common timesys_time compat_sys_time +13 32 timesys_time32 +13 64 timesys_time +13 spu timesys_time 14 common mknod sys_mknod 15 common chmod sys_chmod 16 common lchown sys_lchown @@ -36,14 +38,17 @@ 22 spu umount sys_ni_syscall 23 common setuid sys_setuid 24 common getuid sys_getuid -25 common stime sys_stime compat_sys_stime +25 32 stime sys_stime32 +25 64 stime sys_stime +25 spu stime sys_stime 26 nospu ptrace sys_ptrace compat_sys_ptrace 27 common alarm sys_alarm 28 32 oldfstatsys_fstat sys_ni_syscall 28 64 oldfstatsys_ni_syscall 28 spu oldfstatsys_ni_syscall 29 nospu pause sys_pause -30 nospu utime sys_utime compat_sys_utime +30 32 utime sys_utime32 +30 64 utime sys_utime 31 common sttysys_ni_syscall 32 common gttysys_ni_syscall 33 common access sys_access @@ -157,7 +162,9 @@ 121common setdomainname sys_setdomainname 122common uname sys_newuname 123common modify_ldt sys_ni_syscall -124common adjtimexsys_adjtimex compat_sys_adjtimex +12432 adjtimexsys_adjtimex_time32 +12464 adjtimexsys_adjtimex +124spu adjtimexsys_adjtimex 125common mprotectsys_mprotect 12632 sigprocmask sys_sigprocmask compat_sys_sigprocmask 12664 sigprocmask sys_ni_syscall @@ -198,8 +205,12 @@ 158common sched_yield sys_sched_yield 159common sched_get_priority_max sys_sched_get_priority_max 160common sched_get_priority_min sys_sched_get_priority_min -161common sched_rr_get_interval sys_sched_rr_get_interval compat_sys_sched_rr_get_interval -162common nanosleep sys_nanosleep compat_sys_nanosleep +16132 sched_rr_get_interval sys_sched_rr_get_interval_time32 +16164 sched_rr_get_interval sys_sched_rr_get_interval +161spu sched_rr_get_interval sys_sched_rr_get_interval +16232 nanosleep sys_nanosleep_time32 +16264 nanosleep sys_nanosleep +162spu nanosleep sys_nanosleep 163common mremap
Re: [PATCH] bpf: handle 32-bit zext during constant blinding
Jiong Wang wrote: Naveen N. Rao writes: Since BPF constant blinding is performed after the verifier pass, the ALU32 instructions inserted for doubleword immediate loads don't have a corresponding zext instruction. This is causing a kernel oops on powerpc and can be reproduced by running 'test_cgroup_storage' with bpf_jit_harden=2. Fix this by emitting BPF_ZEXT during constant blinding if prog->aux->verifier_zext is set. Fixes: a4b1d3c1ddf6cb ("bpf: verifier: insert zero extension according to analysis result") Reported-by: Michael Ellerman Signed-off-by: Naveen N. Rao Thanks for the fix. Reviewed-by: Jiong Wang Just two other comments during review in case I am wrong on somewhere. - Use verifier_zext instead of bpf_jit_needs_zext() seems better, even though the latter could avoid extending function argument. Because JIT back-ends look at verifier_zext, true means zext inserted by verifier so JITs won't do the code-gen. Use verifier_zext is sort of keeping JIT blinding the same behaviour has verifier even though blinding doesn't belong to verifier, but for such insn patching, it could be seen as a extension of verifier, therefore use verifier_zext seems better than bpf_jit_needs_zext() to me. - JIT blinding is also escaping the HI32 randomization which happens inside verifier, otherwise x86-64 regression should have caught this issue. Jiong, Thanks for the review. Alexei, Daniel, Can you please pick this up for v5.3. This is a regression and is causing a crash on powerpc. - Naveen
[PATCH] bpf: handle 32-bit zext during constant blinding
Since BPF constant blinding is performed after the verifier pass, the ALU32 instructions inserted for doubleword immediate loads don't have a corresponding zext instruction. This is causing a kernel oops on powerpc and can be reproduced by running 'test_cgroup_storage' with bpf_jit_harden=2. Fix this by emitting BPF_ZEXT during constant blinding if prog->aux->verifier_zext is set. Fixes: a4b1d3c1ddf6cb ("bpf: verifier: insert zero extension according to analysis result") Reported-by: Michael Ellerman Signed-off-by: Naveen N. Rao --- Changes since RFC: - Removed changes to ALU32 and JMP32 ops since those don't alter program execution, and the verifier would have already accounted for them. kernel/bpf/core.c | 8 ++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8191a7db2777..66088a9e9b9e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -890,7 +890,8 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog, static int bpf_jit_blind_insn(const struct bpf_insn *from, const struct bpf_insn *aux, - struct bpf_insn *to_buff) + struct bpf_insn *to_buff, + bool emit_zext) { struct bpf_insn *to = to_buff; u32 imm_rnd = get_random_int(); @@ -1005,6 +1006,8 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. */ *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm); *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + if (emit_zext) + *to++ = BPF_ZEXT_REG(BPF_REG_AX); *to++ = BPF_ALU64_REG(BPF_OR, aux[0].dst_reg, BPF_REG_AX); break; @@ -1088,7 +1091,8 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) insn[1].code == 0) memcpy(aux, insn, sizeof(aux)); - rewritten = bpf_jit_blind_insn(insn, aux, insn_buff); + rewritten = bpf_jit_blind_insn(insn, aux, insn_buff, + clone->aux->verifier_zext); if (!rewritten) continue; -- 2.22.0
Re: Regression fix for bpf in v5.3 (was Re: [RFC PATCH] bpf: handle 32-bit zext during constant blinding)
Jiong Wang wrote: Michael Ellerman writes: "Naveen N. Rao" writes: Since BPF constant blinding is performed after the verifier pass, there are certain ALU32 instructions inserted which don't have a corresponding zext instruction inserted after. This is causing a kernel oops on powerpc and can be reproduced by running 'test_cgroup_storage' with bpf_jit_harden=2. Fix this by emitting BPF_ZEXT during constant blinding if prog->aux->verifier_zext is set. Fixes: a4b1d3c1ddf6cb ("bpf: verifier: insert zero extension according to analysis result") Reported-by: Michael Ellerman Signed-off-by: Naveen N. Rao --- This approach (the location where zext is being introduced below, in particular) works for powerpc, but I am not entirely sure if this is sufficient for other architectures as well. This is broken on v5.3-rc4. Any comment on this? Have commented on https://marc.info/?l=linux-netdev=156637836024743=2 The fix looks correct to me on "BPF_LD | BPF_IMM | BPF_DW", but looks unnecessary on two other places. It would be great if you or Naveen could confirm it. Jiong, Thanks for the review. I can now see why the other two changes are not necessary. I will post a follow-on patch. Thanks! - Naveen
Re: [RFC PATCH] bpf: handle 32-bit zext during constant blinding
Naveen N. Rao wrote: Since BPF constant blinding is performed after the verifier pass, there are certain ALU32 instructions inserted which don't have a corresponding zext instruction inserted after. This is causing a kernel oops on powerpc and can be reproduced by running 'test_cgroup_storage' with bpf_jit_harden=2. Fix this by emitting BPF_ZEXT during constant blinding if prog->aux->verifier_zext is set. Fixes: a4b1d3c1ddf6cb ("bpf: verifier: insert zero extension according to analysis result") Reported-by: Michael Ellerman Signed-off-by: Naveen N. Rao --- This approach (the location where zext is being introduced below, in particular) works for powerpc, but I am not entirely sure if this is sufficient for other architectures as well. This is broken on v5.3-rc4. Alexie, Daniel, Jiong, Any feedback on this? - Naveen
Re: [PATCH 13/16] include/asm-generic: prefer __section from compiler_attributes.h
Nick Desaulniers wrote: Reported-by: Sedat Dilek Suggested-by: Josh Poimboeuf Signed-off-by: Nick Desaulniers --- Acked-by: Naveen N. Rao - Naveen
Re: [PATCH 4/4] arm64: implement KPROBES_ON_FTRACE
Jisheng Zhang wrote: This patch implements KPROBES_ON_FTRACE for arm64. ~ # mount -t debugfs debugfs /sys/kernel/debug/ ~ # cd /sys/kernel/debug/ /sys/kernel/debug # echo 'p _do_fork' > tracing/kprobe_events before the patch: /sys/kernel/debug # cat kprobes/list ff801009ff7c k _do_fork+0x4[DISABLED] This looks wrong -- we should not be allowing kprobe to be registered on ftrace address without KPROBES_ON_FTRACE. Is _do_fork+0x4 the location of ftrace entry on arm64? - Naveen
Re: [PATCH 1/4] kprobes: adjust kprobe addr for KPROBES_ON_FTRACE
Jisheng Zhang wrote: For KPROBES_ON_FTRACE case, we need to adjust the kprobe's addr correspondingly. Signed-off-by: Jisheng Zhang --- kernel/kprobes.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9873fc627d61..f8400753a8a9 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1560,6 +1560,9 @@ int register_kprobe(struct kprobe *p) addr = kprobe_addr(p); if (IS_ERR(addr)) return PTR_ERR(addr); +#ifdef CONFIG_KPROBES_ON_FTRACE + addr = (kprobe_opcode_t *)ftrace_call_adjust((unsigned long)addr); +#endif p->addr = addr; I'm not sure what this is achieving, but looks wrong to me. If you intend to have kprobes default to using ftrace entry for probing functions, consider over-riding kprobe_lookup_name() -- see powerpc variant for example. - Naveen
[RFC PATCH] bpf: handle 32-bit zext during constant blinding
Since BPF constant blinding is performed after the verifier pass, there are certain ALU32 instructions inserted which don't have a corresponding zext instruction inserted after. This is causing a kernel oops on powerpc and can be reproduced by running 'test_cgroup_storage' with bpf_jit_harden=2. Fix this by emitting BPF_ZEXT during constant blinding if prog->aux->verifier_zext is set. Fixes: a4b1d3c1ddf6cb ("bpf: verifier: insert zero extension according to analysis result") Reported-by: Michael Ellerman Signed-off-by: Naveen N. Rao --- This approach (the location where zext is being introduced below, in particular) works for powerpc, but I am not entirely sure if this is sufficient for other architectures as well. This is broken on v5.3-rc4. - Naveen kernel/bpf/core.c | 14 -- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8191a7db2777..d84146e6fd9e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -890,7 +890,8 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog, static int bpf_jit_blind_insn(const struct bpf_insn *from, const struct bpf_insn *aux, - struct bpf_insn *to_buff) + struct bpf_insn *to_buff, + bool emit_zext) { struct bpf_insn *to = to_buff; u32 imm_rnd = get_random_int(); @@ -939,6 +940,8 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); *to++ = BPF_ALU32_REG(from->code, from->dst_reg, BPF_REG_AX); + if (emit_zext) + *to++ = BPF_ZEXT_REG(from->dst_reg); break; case BPF_ALU64 | BPF_ADD | BPF_K: @@ -992,6 +995,10 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, off -= 2; *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + if (emit_zext) { + *to++ = BPF_ZEXT_REG(BPF_REG_AX); + off--; + } *to++ = BPF_JMP32_REG(from->code, from->dst_reg, BPF_REG_AX, off); break; @@ -1005,6 +1012,8 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. */ *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm); *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + if (emit_zext) + *to++ = BPF_ZEXT_REG(BPF_REG_AX); *to++ = BPF_ALU64_REG(BPF_OR, aux[0].dst_reg, BPF_REG_AX); break; @@ -1088,7 +1097,8 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) insn[1].code == 0) memcpy(aux, insn, sizeof(aux)); - rewritten = bpf_jit_blind_insn(insn, aux, insn_buff); + rewritten = bpf_jit_blind_insn(insn, aux, insn_buff, + clone->aux->verifier_zext); if (!rewritten) continue; -- 2.22.0
Re: [PATCH 0/2] ftrace: two fixes with func_probes handling
Naveen N. Rao wrote: Two patches addressing bugs in ftrace function probe handling. The first patch addresses a NULL pointer dereference reported by LTP tests, while the second one is a trivial patch to address a missing check for return value, found by code inspection. Steven, Can you please take a look at these patches? Thanks, Naveen
[PATCH 2/2] ftrace: Check for successful allocation of hash
In register_ftrace_function_probe(), we are not checking the return value of alloc_and_copy_ftrace_hash(). The subsequent call to ftrace_match_records() may end up dereferencing the same. Add a check to ensure this doesn't happen. Signed-off-by: Naveen N. Rao --- kernel/trace/ftrace.c | 5 + 1 file changed, 5 insertions(+) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0791eafb693d..0d5f7d4a4936 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4328,6 +4328,11 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr, old_hash = *orig_hash; hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); + if (!hash) { + ret = -ENOMEM; + goto out; + } + ret = ftrace_match_records(hash, glob, strlen(glob)); /* Nothing found? */ -- 2.22.0
[PATCH 1/2] ftrace: Fix NULL pointer dereference in t_probe_next()
LTP testsuite on powerpc results in the below crash: Unable to handle kernel paging request for data at address 0x Faulting instruction address: 0xc029d800 Oops: Kernel access of bad area, sig: 11 [#1] LE SMP NR_CPUS=2048 NUMA PowerNV ... CPU: 68 PID: 96584 Comm: cat Kdump: loaded Tainted: GW NIP: c029d800 LR: c029dac4 CTR: c01e6ad0 REGS: c0002017fae8ba10 TRAP: 0300 Tainted: GW MSR: 90009033 CR: 28022422 XER: 2004 CFAR: c029d90c DAR: DSISR: 4000 IRQMASK: 0 ... NIP [c029d800] t_probe_next+0x60/0x180 LR [c029dac4] t_mod_start+0x1a4/0x1f0 Call Trace: [c0002017fae8bc90] [c0cdbc40] _cond_resched+0x10/0xb0 (unreliable) [c0002017fae8bce0] [c02a15b0] t_start+0xf0/0x1c0 [c0002017fae8bd30] [c04ec2b4] seq_read+0x184/0x640 [c0002017fae8bdd0] [c04a57bc] sys_read+0x10c/0x300 [c0002017fae8be30] [c000b388] system_call+0x5c/0x70 The test (ftrace_set_ftrace_filter.sh) is part of ftrace stress tests and the crash happens when the test does 'cat $TRACING_PATH/set_ftrace_filter'. The address points to the second line below, in t_probe_next(), where filter_hash is dereferenced: hash = iter->probe->ops.func_hash->filter_hash; size = 1 << hash->size_bits; This happens due to a race with register_ftrace_function_probe(). A new ftrace_func_probe is created and added into the func_probes list in trace_array under ftrace_lock. However, before initializing the filter, we drop ftrace_lock, and re-acquire it after acquiring regex_lock. If another process is trying to read set_ftrace_filter, it will be able to acquire ftrace_lock during this window and it will end up seeing a NULL filter_hash. Fix this by just checking for a NULL filter_hash in t_probe_next(). If the filter_hash is NULL, then this probe is just being added and we can simply return from here. Signed-off-by: Naveen N. Rao --- kernel/trace/ftrace.c | 4 1 file changed, 4 insertions(+) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7b037295a1f1..0791eafb693d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3093,6 +3093,10 @@ t_probe_next(struct seq_file *m, loff_t *pos) hnd = >probe_entry->hlist; hash = iter->probe->ops.func_hash->filter_hash; + + if (!hash) + return NULL; + size = 1 << hash->size_bits; retry: -- 2.22.0
[PATCH 0/2] ftrace: two fixes with func_probes handling
Two patches addressing bugs in ftrace function probe handling. The first patch addresses a NULL pointer dereference reported by LTP tests, while the second one is a trivial patch to address a missing check for return value, found by code inspection. - Naveen Naveen N. Rao (2): ftrace: Fix NULL pointer dereference in t_probe_next() ftrace: Check for successful allocation of hash kernel/trace/ftrace.c | 9 + 1 file changed, 9 insertions(+) -- 2.22.0
Re: [PATCH v2 4/7] powerpc/ftrace: Additionally nop out the preceding mflr with -mprofile-kernel
Steven Rostedt wrote: On Thu, 27 Jun 2019 20:58:20 +0530 "Naveen N. Rao" wrote: > But interesting, I don't see a synchronize_rcu_tasks() call > there. We felt we don't need it in this case. We patch the branch to ftrace with a nop first. Other cpus should see that first. But, now that I think about it, should we add a memory barrier to ensure the writes get ordered properly? Do you send an ipi to the other CPUs. I would just to be safe. We are handling this through ftrace_replace_code() and __ftrace_make_call_prep() below. For FTRACE_UPDATE_MAKE_CALL, we patch in the mflr, followed by smp_call_function(isync) and synchronize_rcu_tasks() before we proceed to patch the branch to ftrace. I don't see any other scenario where we end up in __ftrace_make_nop_kernel() without going through ftrace_replace_code(). For kernel modules, this can happen during module load/init and so, I patch out both instructions in __ftrace_make_call() above without any synchronization. Am I missing anything? No, I think I got confused ;-), it's the patch out that I was worried about, but when I was going through the scenario, I somehow turned it into the patching in (which I already audited :-p). I was going to reply with just the top part of this email, but then the confusion started :-/ OK, yes, patching out should be fine, and you already covered the patching in. Sorry for the noise. Just to confirm and totally remove the confusion, the patch does: : mflrr0 <-- preempt here bl _mcount : mflrr0 nop And this is fine regardless. OK, Reviewed-by: Steven Rostedt (VMware) Thanks for confirming! We do need an IPI to be sure, as you pointed out above. I will have the patching out take the same path to simplify things. - Naveen
Re: [PATCH v2 4/7] powerpc/ftrace: Additionally nop out the preceding mflr with -mprofile-kernel
Hi Steven, Thanks for the review! Steven Rostedt wrote: On Thu, 27 Jun 2019 16:53:52 +0530 "Naveen N. Rao" wrote: With -mprofile-kernel, gcc emits 'mflr r0', followed by 'bl _mcount' to enable function tracing and profiling. So far, with dynamic ftrace, we used to only patch out the branch to _mcount(). However, mflr is executed by the branch unit that can only execute one per cycle on POWER9 and shared with branches, so it would be nice to avoid it where possible. We cannot simply nop out the mflr either. When enabling function tracing, there can be a race if tracing is enabled when some thread was interrupted after executing a nop'ed out mflr. In this case, the thread would execute the now-patched-in branch to _mcount() without having executed the preceding mflr. To solve this, we now enable function tracing in 2 steps: patch in the mflr instruction, use 'smp_call_function(isync); synchronize_rcu_tasks()' to ensure all existing threads make progress, and then patch in the branch to _mcount(). We override ftrace_replace_code() with a powerpc64 variant for this purpose. You may want to explain that you do the reverse when patching it out. That is, patch out the "bl _mcount" into a nop and then patch out the "mflr r0". Sure. I think we can add: "When disabling function tracing, we can nop out the two instructions without need for any synchronization in between, as long as we nop out the branch to ftrace first. The lone 'mflr r0' is harmless. Finally, with FTRACE_UPDATE_MODIFY_CALL, no changes are needed since we are simply changing where the branch to ftrace goes." But interesting, I don't see a synchronize_rcu_tasks() call there. We felt we don't need it in this case. We patch the branch to ftrace with a nop first. Other cpus should see that first. But, now that I think about it, should we add a memory barrier to ensure the writes get ordered properly? Suggested-by: Nicholas Piggin Reviewed-by: Nicholas Piggin Signed-off-by: Naveen N. Rao --- arch/powerpc/kernel/trace/ftrace.c | 258 ++--- 1 file changed, 236 insertions(+), 22 deletions(-) diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index 517662a56bdc..86c2b50dcaa9 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -125,7 +125,7 @@ __ftrace_make_nop(struct module *mod, { unsigned long entry, ptr, tramp; unsigned long ip = rec->ip; - unsigned int op, pop; + unsigned int op; /* read where this goes */ if (probe_kernel_read(, (void *)ip, sizeof(int))) { @@ -160,8 +160,6 @@ __ftrace_make_nop(struct module *mod, #ifdef CONFIG_MPROFILE_KERNEL /* When using -mkernel_profile there is no load to jump over */ - pop = PPC_INST_NOP; - if (probe_kernel_read(, (void *)(ip - 4), 4)) { pr_err("Fetching instruction at %lx failed.\n", ip - 4); return -EFAULT; @@ -169,26 +167,23 @@ __ftrace_make_nop(struct module *mod, /* We expect either a mflr r0, or a std r0, LRSAVE(r1) */ if (op != PPC_INST_MFLR && op != PPC_INST_STD_LR) { - pr_err("Unexpected instruction %08x around bl _mcount\n", op); + pr_err("Unexpected instruction %08x before bl _mcount\n", op); return -EINVAL; } -#else - /* -* Our original call site looks like: -* -* bl -* ld r2,XX(r1) -* -* Milton Miller pointed out that we can not simply nop the branch. -* If a task was preempted when calling a trace function, the nops -* will remove the way to restore the TOC in r2 and the r2 TOC will -* get corrupted. -* -* Use a b +8 to jump over the load. -*/ - pop = PPC_INST_BRANCH | 8; /* b +8 */ + /* We should patch out the bl to _mcount first */ + if (patch_instruction((unsigned int *)ip, PPC_INST_NOP)) { + pr_err("Patching NOP failed.\n"); + return -EPERM; + } + /* then, nop out the preceding 'mflr r0' as an optimization */ + if (op == PPC_INST_MFLR && + patch_instruction((unsigned int *)(ip - 4), PPC_INST_NOP)) { + pr_err("Patching NOP failed.\n"); + return -EPERM; + } +#else /* * Check what is in the next instruction. We can see ld r2,40(r1), but * on first pass after boot we will see mflr r0. @@ -202,12 +197,25 @@ __ftrace_make_nop(struct module *mod, pr_err("Expected %08x found %08x\n", PPC_INST_LD_TOC, op); return -EINVAL; } -#endif /* CONFIG_MPROFILE_KERNEL */ - if (patch_instruction((unsigned int *)ip, pop)) { + /* +* Our original call site looks like: +* +* bl +* ld r2,XX(r1) +
Re: [PATCH v2 4/7] powerpc/ftrace: Additionally nop out the preceding mflr with -mprofile-kernel
Naveen N. Rao wrote: With -mprofile-kernel, gcc emits 'mflr r0', followed by 'bl _mcount' to enable function tracing and profiling. So far, with dynamic ftrace, we used to only patch out the branch to _mcount(). However, mflr is executed by the branch unit that can only execute one per cycle on POWER9 and shared with branches, so it would be nice to avoid it where possible. We cannot simply nop out the mflr either. When enabling function tracing, there can be a race if tracing is enabled when some thread was interrupted after executing a nop'ed out mflr. In this case, the thread would execute the now-patched-in branch to _mcount() without having executed the preceding mflr. To solve this, we now enable function tracing in 2 steps: patch in the mflr instruction, use 'smp_call_function(isync); synchronize_rcu_tasks()' to ensure all existing threads make progress, and then patch in the branch to _mcount(). We override ftrace_replace_code() with a powerpc64 variant for this purpose. Suggested-by: Nicholas Piggin Reviewed-by: Nicholas Piggin Signed-off-by: Naveen N. Rao --- arch/powerpc/kernel/trace/ftrace.c | 258 ++--- 1 file changed, 236 insertions(+), 22 deletions(-) I missed adding a comment here to explain the changes. As discussed in the previous series, I think we are ok with this patch from a CMODX perspective. For smp_call_function(), I decided to have it included in this patch since we know that we need it here for sure. I am not entirely sure we want to do that in patch_instruction() since ftrace doesn't seem to need it elsewhere. As Nick Piggin pointed out, we may want to have users of patch_instruction() (kprobes) add the necessary synchronization. Thanks, Naveen
Re: [PATCH v2 2/7] x86/ftrace: Fix use of flags in ftrace_replace_code()
Steven Rostedt wrote: On Thu, 27 Jun 2019 16:53:50 +0530 "Naveen N. Rao" wrote: In commit a0572f687fb3c ("ftrace: Allow ftrace_replace_code() to be schedulable), the generic ftrace_replace_code() function was modified to accept a flags argument in place of a single 'enable' flag. However, the x86 version of this function was not updated. Fix the same. Fixes: a0572f687fb3c ("ftrace: Allow ftrace_replace_code() to be schedulable") I don't mind this change, but it's not a bug, and I'm not sure it should have the fixes tag. The reason being, the FTRACE_MODIFY_ENABLE_FL is only set when ftrace is called by with the command flag FTRACE_MAY_SLEEP, which is never done on x86. I guess you meant to say that *FTRACE_MODIFY_MAY_SLEEP_FL* is only set with FTRACE_MAY_SLEEP. That said, I'm fine with the change as it makes it more robust, but by adding the fixes tag, you're going to get this into all the stable code, and I'm not sure that's really necessary. Agreed. Thanks for pointing this out. We can drop this patch from this series and I will re-post this as a simpler cleanup later on. Thanks, Naveen
Re: [PATCH v2 2/7] x86/ftrace: Fix use of flags in ftrace_replace_code()
Naveen N. Rao wrote: In commit a0572f687fb3c ("ftrace: Allow ftrace_replace_code() to be schedulable), the generic ftrace_replace_code() function was modified to accept a flags argument in place of a single 'enable' flag. However, the x86 version of this function was not updated. Fix the same. Fixes: a0572f687fb3c ("ftrace: Allow ftrace_replace_code() to be schedulable") Signed-off-by: Naveen N. Rao --- arch/x86/kernel/ftrace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) If the rest of this series is ok, and if Ingo and Steven are ok to have this series go through the powerpc tree, then I can re-post this particular patch for x86 after -rc1. Thanks, Naveen
[PATCH v2 6/7] kprobes/ftrace: Use ftrace_location() when [dis]arming probes
Ftrace location could include more than a single instruction in case of some architectures (powerpc64, for now). In this case, kprobe is permitted on any of those instructions, and uses ftrace infrastructure for functioning. However, [dis]arm_kprobe_ftrace() uses the kprobe address when setting up ftrace filter IP. This won't work if the address points to any instruction apart from the one that has a branch to _mcount(). To resolve this, have [dis]arm_kprobe_ftrace() use ftrace_function() to identify the filter IP. Acked-by: Masami Hiramatsu Signed-off-by: Naveen N. Rao --- kernel/kprobes.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 445337c107e0..282ee704e2d8 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -978,10 +978,10 @@ static int prepare_kprobe(struct kprobe *p) /* Caller must lock kprobe_mutex */ static int arm_kprobe_ftrace(struct kprobe *p) { + unsigned long ftrace_ip = ftrace_location((unsigned long)p->addr); int ret = 0; - ret = ftrace_set_filter_ip(_ftrace_ops, - (unsigned long)p->addr, 0, 0); + ret = ftrace_set_filter_ip(_ftrace_ops, ftrace_ip, 0, 0); if (ret) { pr_debug("Failed to arm kprobe-ftrace at %pS (%d)\n", p->addr, ret); @@ -1005,13 +1005,14 @@ static int arm_kprobe_ftrace(struct kprobe *p) * non-empty filter_hash for IPMODIFY ops, we're safe from an accidental * empty filter_hash which would undesirably trace all functions. */ - ftrace_set_filter_ip(_ftrace_ops, (unsigned long)p->addr, 1, 0); + ftrace_set_filter_ip(_ftrace_ops, ftrace_ip, 1, 0); return ret; } /* Caller must lock kprobe_mutex */ static int disarm_kprobe_ftrace(struct kprobe *p) { + unsigned long ftrace_ip = ftrace_location((unsigned long)p->addr); int ret = 0; if (kprobe_ftrace_enabled == 1) { @@ -1022,8 +1023,7 @@ static int disarm_kprobe_ftrace(struct kprobe *p) kprobe_ftrace_enabled--; - ret = ftrace_set_filter_ip(_ftrace_ops, - (unsigned long)p->addr, 1, 0); + ret = ftrace_set_filter_ip(_ftrace_ops, ftrace_ip, 1, 0); WARN_ONCE(ret < 0, "Failed to disarm kprobe-ftrace at %pS (%d)\n", p->addr, ret); return ret; -- 2.22.0
[PATCH v2 5/7] ftrace: Update ftrace_location() for powerpc -mprofile-kernel
Now that we are patching the preceding 'mflr r0' instruction with -mprofile-kernel, we need to update ftrace_location() to recognise that as being part of ftrace. To do this, we introduce FTRACE_IP_EXTENSION to denote the length (in bytes) of the mcount caller. By default, this is set to 0. For powerpc with CONFIG_MPROFILE_KERNEL, we set this to MCOUNT_INSN_SIZE so that this works if ftrace_location() is called with the address of the actual ftrace entry call, or with the address of the preceding 'mflr r0' instruction. Note that we do not check if the preceding instruction is indeed the 'mflr r0'. Earlier -mprofile-kernel ABI included a 'std r0,stack' instruction between the 'mflr r0' and the 'bl _mcount'. This is harmless as the 'std r0,stack' instruction is inconsequential and is not relied upon. Suggested-by: Steven Rostedt (VMware) Signed-off-by: Naveen N. Rao --- arch/powerpc/include/asm/ftrace.h | 8 include/linux/ftrace.h| 9 + kernel/trace/ftrace.c | 2 +- 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index 3dfb80b86561..510a8ac8ac8d 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -61,6 +61,14 @@ struct dyn_arch_ftrace { #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS #define ARCH_SUPPORTS_FTRACE_OPS 1 + +#ifdef CONFIG_MPROFILE_KERNEL +/* + * We consider two instructions -- 'mflr r0', 'bl _mcount' -- to be part + * of ftrace with -mprofile-kernel + */ +#define FTRACE_IP_EXTENSIONMCOUNT_INSN_SIZE +#endif #endif #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index fa653a561da5..310b514438cb 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -29,6 +29,15 @@ #define ARCH_SUPPORTS_FTRACE_OPS 0 #endif +/* + * This denotes the number of instructions (in bytes) that is used by the + * arch mcount caller. All instructions in this range will be owned by + * ftrace. + */ +#ifndef FTRACE_IP_EXTENSION +#define FTRACE_IP_EXTENSION 0 +#endif + /* * If the arch's mcount caller does not support all of ftrace's * features, then it must call an indirect function that diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 21d8e201ee80..308555925b81 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1575,7 +1575,7 @@ unsigned long ftrace_location_range(unsigned long start, unsigned long end) */ unsigned long ftrace_location(unsigned long ip) { - return ftrace_location_range(ip, ip); + return ftrace_location_range(ip, ip + FTRACE_IP_EXTENSION); } /** -- 2.22.0
[PATCH v2 2/7] x86/ftrace: Fix use of flags in ftrace_replace_code()
In commit a0572f687fb3c ("ftrace: Allow ftrace_replace_code() to be schedulable), the generic ftrace_replace_code() function was modified to accept a flags argument in place of a single 'enable' flag. However, the x86 version of this function was not updated. Fix the same. Fixes: a0572f687fb3c ("ftrace: Allow ftrace_replace_code() to be schedulable") Signed-off-by: Naveen N. Rao --- arch/x86/kernel/ftrace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 0927bb158ffc..f34005a17051 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -573,8 +573,9 @@ static void run_sync(void) local_irq_disable(); } -void ftrace_replace_code(int enable) +void ftrace_replace_code(int mod_flags) { + int enable = mod_flags & FTRACE_MODIFY_ENABLE_FL; struct ftrace_rec_iter *iter; struct dyn_ftrace *rec; const char *report = "adding breakpoints"; -- 2.22.0
[PATCH v2 0/7] powerpc/ftrace: Patch out -mprofile-kernel instructions
Changes since v1 (https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=114556): - Patches 1,2,3 and 6: No changes - Patch 4: Add smp_call_function() to flush icache on all cpus after patching in the 'mflr r0' instruction. - Patch 5: Changes as per Steven Rostedt's suggestions. - Patch 7: Changes as per Masami and Joe Perches. -- On powerpc64, -mprofile-kernel results in two instructions being emitted: 'mflr r0' and 'bl _mcount'. So far, we were only nop'ing out the branch to _mcount(). This series implements an approach to also nop out the preceding mflr. - Naveen Naveen N. Rao (7): ftrace: Expose flags used for ftrace_replace_code() x86/ftrace: Fix use of flags in ftrace_replace_code() ftrace: Expose __ftrace_replace_code() powerpc/ftrace: Additionally nop out the preceding mflr with -mprofile-kernel ftrace: Update ftrace_location() for powerpc -mprofile-kernel kprobes/ftrace: Use ftrace_location() when [dis]arming probes powerpc/kprobes: Allow probing on any ftrace address arch/powerpc/include/asm/ftrace.h| 8 + arch/powerpc/kernel/kprobes-ftrace.c | 32 +++- arch/powerpc/kernel/trace/ftrace.c | 258 --- arch/x86/kernel/ftrace.c | 3 +- include/linux/ftrace.h | 15 ++ kernel/kprobes.c | 10 +- kernel/trace/ftrace.c| 15 +- 7 files changed, 302 insertions(+), 39 deletions(-) -- 2.22.0
[PATCH v2 1/7] ftrace: Expose flags used for ftrace_replace_code()
Since ftrace_replace_code() is a __weak function and can be overridden, we need to expose the flags that can be set. So, move the flags enum to the header file. Reviewed-by: Steven Rostedt (VMware) Signed-off-by: Naveen N. Rao --- include/linux/ftrace.h | 5 + kernel/trace/ftrace.c | 5 - 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 25e2995d4a4c..e97789c95c4e 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -162,6 +162,11 @@ enum { FTRACE_OPS_FL_TRACE_ARRAY = 1 << 15, }; +enum { + FTRACE_MODIFY_ENABLE_FL = (1 << 0), + FTRACE_MODIFY_MAY_SLEEP_FL = (1 << 1), +}; + #ifdef CONFIG_DYNAMIC_FTRACE /* The hash used to know what functions callbacks trace */ struct ftrace_ops_hash { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 38277af44f5c..5710a6b3edc1 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -75,11 +75,6 @@ #define INIT_OPS_HASH(opsname) #endif -enum { - FTRACE_MODIFY_ENABLE_FL = (1 << 0), - FTRACE_MODIFY_MAY_SLEEP_FL = (1 << 1), -}; - struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB, -- 2.22.0
[PATCH v2 7/7] powerpc/kprobes: Allow probing on any ftrace address
With KPROBES_ON_FTRACE, kprobe is allowed to be inserted on instructions that branch to _mcount (referred to as ftrace location). With -mprofile-kernel, we now include the preceding 'mflr r0' as being part of the ftrace location. However, by default, probing on an instruction that is not actually the branch to _mcount() is prohibited, as that is considered to not be at an instruction boundary. This is not the case on powerpc, so allow the same by overriding arch_check_ftrace_location() In addition, we update kprobe_ftrace_handler() to detect this scenarios and to pass the proper nip to the pre and post probe handlers. Signed-off-by: Naveen N. Rao --- arch/powerpc/kernel/kprobes-ftrace.c | 32 +++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c index 972cb28174b2..23c840748183 100644 --- a/arch/powerpc/kernel/kprobes-ftrace.c +++ b/arch/powerpc/kernel/kprobes-ftrace.c @@ -12,15 +12,35 @@ #include #include +/* + * With -mprofile-kernel, we patch two instructions -- the branch to _mcount + * as well as the preceding 'mflr r0'. Both these instructions are claimed + * by ftrace and we should allow probing on either instruction. + */ +int arch_check_ftrace_location(struct kprobe *p) +{ + if (ftrace_location((unsigned long)p->addr)) + p->flags |= KPROBE_FLAG_FTRACE; + return 0; +} + /* Ftrace callback handler for kprobes */ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, struct ftrace_ops *ops, struct pt_regs *regs) { struct kprobe *p; + int mflr_kprobe = 0; struct kprobe_ctlblk *kcb; p = get_kprobe((kprobe_opcode_t *)nip); - if (unlikely(!p) || kprobe_disabled(p)) + if (!p) { + p = get_kprobe((kprobe_opcode_t *)(nip - MCOUNT_INSN_SIZE)); + if (unlikely(!p)) + return; + mflr_kprobe = 1; + } + + if (kprobe_disabled(p)) return; kcb = get_kprobe_ctlblk(); @@ -33,6 +53,9 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, */ regs->nip -= MCOUNT_INSN_SIZE; + if (mflr_kprobe) + regs->nip -= MCOUNT_INSN_SIZE; + __this_cpu_write(current_kprobe, p); kcb->kprobe_status = KPROBE_HIT_ACTIVE; if (!p->pre_handler || !p->pre_handler(p, regs)) { @@ -45,6 +68,8 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, kcb->kprobe_status = KPROBE_HIT_SSDONE; p->post_handler(p, regs, 0); } + if (mflr_kprobe) + regs->nip += MCOUNT_INSN_SIZE; } /* * If pre_handler returns !0, it changes regs->nip. We have to @@ -57,6 +82,11 @@ NOKPROBE_SYMBOL(kprobe_ftrace_handler); int arch_prepare_kprobe_ftrace(struct kprobe *p) { + if ((unsigned long)p->addr & 0x03) { + pr_err("Attempt to register kprobe at an unaligned address\n"); + return -EILSEQ; + } + p->ainsn.insn = NULL; p->ainsn.boostable = -1; return 0; -- 2.22.0
[PATCH v2 4/7] powerpc/ftrace: Additionally nop out the preceding mflr with -mprofile-kernel
With -mprofile-kernel, gcc emits 'mflr r0', followed by 'bl _mcount' to enable function tracing and profiling. So far, with dynamic ftrace, we used to only patch out the branch to _mcount(). However, mflr is executed by the branch unit that can only execute one per cycle on POWER9 and shared with branches, so it would be nice to avoid it where possible. We cannot simply nop out the mflr either. When enabling function tracing, there can be a race if tracing is enabled when some thread was interrupted after executing a nop'ed out mflr. In this case, the thread would execute the now-patched-in branch to _mcount() without having executed the preceding mflr. To solve this, we now enable function tracing in 2 steps: patch in the mflr instruction, use 'smp_call_function(isync); synchronize_rcu_tasks()' to ensure all existing threads make progress, and then patch in the branch to _mcount(). We override ftrace_replace_code() with a powerpc64 variant for this purpose. Suggested-by: Nicholas Piggin Reviewed-by: Nicholas Piggin Signed-off-by: Naveen N. Rao --- arch/powerpc/kernel/trace/ftrace.c | 258 ++--- 1 file changed, 236 insertions(+), 22 deletions(-) diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index 517662a56bdc..86c2b50dcaa9 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -125,7 +125,7 @@ __ftrace_make_nop(struct module *mod, { unsigned long entry, ptr, tramp; unsigned long ip = rec->ip; - unsigned int op, pop; + unsigned int op; /* read where this goes */ if (probe_kernel_read(, (void *)ip, sizeof(int))) { @@ -160,8 +160,6 @@ __ftrace_make_nop(struct module *mod, #ifdef CONFIG_MPROFILE_KERNEL /* When using -mkernel_profile there is no load to jump over */ - pop = PPC_INST_NOP; - if (probe_kernel_read(, (void *)(ip - 4), 4)) { pr_err("Fetching instruction at %lx failed.\n", ip - 4); return -EFAULT; @@ -169,26 +167,23 @@ __ftrace_make_nop(struct module *mod, /* We expect either a mflr r0, or a std r0, LRSAVE(r1) */ if (op != PPC_INST_MFLR && op != PPC_INST_STD_LR) { - pr_err("Unexpected instruction %08x around bl _mcount\n", op); + pr_err("Unexpected instruction %08x before bl _mcount\n", op); return -EINVAL; } -#else - /* -* Our original call site looks like: -* -* bl -* ld r2,XX(r1) -* -* Milton Miller pointed out that we can not simply nop the branch. -* If a task was preempted when calling a trace function, the nops -* will remove the way to restore the TOC in r2 and the r2 TOC will -* get corrupted. -* -* Use a b +8 to jump over the load. -*/ - pop = PPC_INST_BRANCH | 8; /* b +8 */ + /* We should patch out the bl to _mcount first */ + if (patch_instruction((unsigned int *)ip, PPC_INST_NOP)) { + pr_err("Patching NOP failed.\n"); + return -EPERM; + } + /* then, nop out the preceding 'mflr r0' as an optimization */ + if (op == PPC_INST_MFLR && + patch_instruction((unsigned int *)(ip - 4), PPC_INST_NOP)) { + pr_err("Patching NOP failed.\n"); + return -EPERM; + } +#else /* * Check what is in the next instruction. We can see ld r2,40(r1), but * on first pass after boot we will see mflr r0. @@ -202,12 +197,25 @@ __ftrace_make_nop(struct module *mod, pr_err("Expected %08x found %08x\n", PPC_INST_LD_TOC, op); return -EINVAL; } -#endif /* CONFIG_MPROFILE_KERNEL */ - if (patch_instruction((unsigned int *)ip, pop)) { + /* +* Our original call site looks like: +* +* bl +* ld r2,XX(r1) +* +* Milton Miller pointed out that we can not simply nop the branch. +* If a task was preempted when calling a trace function, the nops +* will remove the way to restore the TOC in r2 and the r2 TOC will +* get corrupted. +* +* Use a b +8 to jump over the load. +*/ + if (patch_instruction((unsigned int *)ip, PPC_INST_BRANCH | 8)) { pr_err("Patching NOP failed.\n"); return -EPERM; } +#endif /* CONFIG_MPROFILE_KERNEL */ return 0; } @@ -421,6 +429,26 @@ static int __ftrace_make_nop_kernel(struct dyn_ftrace *rec, unsigned long addr) return -EPERM; } +#ifdef CONFIG_MPROFILE_KERNEL + /* Nop out the preceding 'mflr r0' as an optimization */ + if (probe_kernel_read(, (void *)(ip - 4), 4)) { + pr_err("Fetching instruction at %lx failed.\n", ip - 4); +
[PATCH v2 3/7] ftrace: Expose __ftrace_replace_code()
While over-riding ftrace_replace_code(), we still want to reuse the existing __ftrace_replace_code() function. Rename the function and make it available for other kernel code. Signed-off-by: Naveen N. Rao --- include/linux/ftrace.h | 1 + kernel/trace/ftrace.c | 8 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index e97789c95c4e..fa653a561da5 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -456,6 +456,7 @@ ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable); /* defined in arch */ extern int ftrace_ip_converted(unsigned long ip); extern int ftrace_dyn_arch_init(void); +extern int ftrace_replace_code_rec(struct dyn_ftrace *rec, int enable); extern void ftrace_replace_code(int enable); extern int ftrace_update_ftrace_func(ftrace_func_t func); extern void ftrace_caller(void); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5710a6b3edc1..21d8e201ee80 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2351,8 +2351,8 @@ unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) return (unsigned long)FTRACE_ADDR; } -static int -__ftrace_replace_code(struct dyn_ftrace *rec, int enable) +int +ftrace_replace_code_rec(struct dyn_ftrace *rec, int enable) { unsigned long ftrace_old_addr; unsigned long ftrace_addr; @@ -2403,7 +2403,7 @@ void __weak ftrace_replace_code(int mod_flags) if (rec->flags & FTRACE_FL_DISABLED) continue; - failed = __ftrace_replace_code(rec, enable); + failed = ftrace_replace_code_rec(rec, enable); if (failed) { ftrace_bug(failed, rec); /* Stop processing */ @@ -5827,7 +5827,7 @@ void ftrace_module_enable(struct module *mod) rec->flags = cnt; if (ftrace_start_up && cnt) { - int failed = __ftrace_replace_code(rec, 1); + int failed = ftrace_replace_code_rec(rec, 1); if (failed) { ftrace_bug(failed, rec); goto out_loop; -- 2.22.0
[PATCH] recordmcount: Fix spurious mcount entries on powerpc
The recent change enabling HAVE_C_RECORDMCOUNT on powerpc started showing the following issue: # modprobe kprobe_example ftrace-powerpc: Not expected bl: opcode is 3c4c0001 WARNING: CPU: 0 PID: 227 at kernel/trace/ftrace.c:2001 ftrace_bug+0x90/0x318 Modules linked in: CPU: 0 PID: 227 Comm: modprobe Not tainted 5.2.0-rc6-00678-g1c329100b942 #2 NIP: c0264318 LR: c025d694 CTR: c0f5cd30 REGS: c1f2b7b0 TRAP: 0700 Not tainted (5.2.0-rc6-00678-g1c329100b942) MSR: 90010282b033 CR: 28228222 XER: CFAR: c02642fc IRQMASK: 0 NIP [c0264318] ftrace_bug+0x90/0x318 LR [c025d694] ftrace_process_locs+0x4f4/0x5e0 Call Trace: [c1f2ba40] [0004] 0x4 (unreliable) [c1f2bad0] [c025d694] ftrace_process_locs+0x4f4/0x5e0 [c1f2bb90] [c020ff10] load_module+0x25b0/0x30c0 [c1f2bd00] [c0210cb0] sys_finit_module+0xc0/0x130 [c1f2be20] [c000bda4] system_call+0x5c/0x70 Instruction dump: 419e0018 2f83 419e00bc 2f83ffea 409e00cc 481c 0fe0 3c62ff96 3901 3940 386386d0 48c4 <0fe0> 3ce20003 3901 3c62ff96 ---[ end trace 4c438d5cebf78381 ]--- ftrace failed to modify [] 0xc008012a0008 actual: 01:00:4c:3c Initializing ftrace call sites ftrace record flags: 200 (0) expected tramp: c006af4c Looking at the relocation records in __mcount_loc showed a few spurious entries: RELOCATION RECORDS FOR [__mcount_loc]: OFFSET TYPE VALUE R_PPC64_ADDR64.text.unlikely+0x0008 0008 R_PPC64_ADDR64.text.unlikely+0x0014 0010 R_PPC64_ADDR64.text.unlikely+0x0060 0018 R_PPC64_ADDR64.text.unlikely+0x00b4 0020 R_PPC64_ADDR64.init.text+0x0008 0028 R_PPC64_ADDR64.init.text+0x0014 The first entry in each section is incorrect. Looking at the relocation records, the spurious entries correspond to the R_PPC64_ENTRY records: RELOCATION RECORDS FOR [.text.unlikely]: OFFSET TYPE VALUE R_PPC64_REL64 .TOC.-0x0008 0008 R_PPC64_ENTRY *ABS* 0014 R_PPC64_REL24 _mcount The problem is that we are not validating the return value from get_mcountsym() in sift_rel_mcount(). With this entry, mcountsym is 0, but Elf_r_sym(relp) also ends up being 0. Fix this by ensuring mcountsym is valid before processing the entry. Fixes: c7d64b560ce80 ("powerpc/ftrace: Enable C Version of recordmcount") Signed-off-by: Naveen N. Rao --- scripts/recordmcount.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/recordmcount.h b/scripts/recordmcount.h index 13c5e6c8829c..47fca2c69a73 100644 --- a/scripts/recordmcount.h +++ b/scripts/recordmcount.h @@ -325,7 +325,8 @@ static uint_t *sift_rel_mcount(uint_t *mlocp, if (!mcountsym) mcountsym = get_mcountsym(sym0, relp, str0); - if (mcountsym == Elf_r_sym(relp) && !is_fake_mcount(relp)) { + if (mcountsym && mcountsym == Elf_r_sym(relp) && + !is_fake_mcount(relp)) { uint_t const addend = _w(_w(relp->r_offset) - recval + mcount_adjust); mrelp->r_offset = _w(offbase -- 2.22.0
Re: [PATCH 7/7] powerpc/kprobes: Allow probing on any ftrace address
Masami Hiramatsu wrote: On Tue, 18 Jun 2019 20:17:06 +0530 "Naveen N. Rao" wrote: With KPROBES_ON_FTRACE, kprobe is allowed to be inserted on instructions that branch to _mcount (referred to as ftrace location). With -mprofile-kernel, we now include the preceding 'mflr r0' as being part of the ftrace location. However, by default, probing on an instruction that is not actually the branch to _mcount() is prohibited, as that is considered to not be at an instruction boundary. This is not the case on powerpc, so allow the same by overriding arch_check_ftrace_location() In addition, we update kprobe_ftrace_handler() to detect this scenarios and to pass the proper nip to the pre and post probe handlers. Signed-off-by: Naveen N. Rao --- arch/powerpc/kernel/kprobes-ftrace.c | 30 1 file changed, 30 insertions(+) diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c index 972cb28174b2..6a0bd3c16cb6 100644 --- a/arch/powerpc/kernel/kprobes-ftrace.c +++ b/arch/powerpc/kernel/kprobes-ftrace.c @@ -12,14 +12,34 @@ #include #include +/* + * With -mprofile-kernel, we patch two instructions -- the branch to _mcount + * as well as the preceding 'mflr r0'. Both these instructions are claimed + * by ftrace and we should allow probing on either instruction. + */ +int arch_check_ftrace_location(struct kprobe *p) +{ + if (ftrace_location((unsigned long)p->addr)) + p->flags |= KPROBE_FLAG_FTRACE; + return 0; +} + /* Ftrace callback handler for kprobes */ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, struct ftrace_ops *ops, struct pt_regs *regs) { struct kprobe *p; + int mflr_kprobe = 0; struct kprobe_ctlblk *kcb; p = get_kprobe((kprobe_opcode_t *)nip); + if (unlikely(!p)) { Hmm, is this really unlikely? If we put a kprobe on the second instruction address, we will see p == NULL always. + p = get_kprobe((kprobe_opcode_t *)(nip - MCOUNT_INSN_SIZE)); + if (!p) Here will be unlikely, because we can not find kprobe at both of nip and nip - MCOUNT_INSN_SIZE. + return; + mflr_kprobe = 1; + } + if (unlikely(!p) || kprobe_disabled(p)) "unlikely(!p)" is not needed here. ... Joe Perches wrote: On Fri, 2019-06-21 at 23:50 +0900, Masami Hiramatsu wrote: On Tue, 18 Jun 2019 20:17:06 +0530 "Naveen N. Rao" wrote: trivia: > diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c [] > @@ -57,6 +82,11 @@ NOKPROBE_SYMBOL(kprobe_ftrace_handler); > > int arch_prepare_kprobe_ftrace(struct kprobe *p) > { > + if ((unsigned long)p->addr & 0x03) { > + printk("Attempt to register kprobe at an unaligned address\n"); Please use the appropriate KERN_ or pr_ All good points. Thanks for the review. - Naveen
Re: [PATCH 4/7] powerpc/ftrace: Additionally nop out the preceding mflr with -mprofile-kernel
Nicholas Piggin wrote: Naveen N. Rao's on June 19, 2019 7:53 pm: Nicholas Piggin wrote: Michael Ellerman's on June 19, 2019 3:14 pm: I'm also not convinced the ordering between the two patches is guaranteed by the ISA, given that there's possibly no isync on the other CPU. Will they go through a context synchronizing event? synchronize_rcu_tasks() should ensure a thread is scheduled away, but I'm not actually sure it guarantees CSI if it's kernel->kernel. Could do a smp_call_function to do the isync on each CPU to be sure. Good point. Per Documentation/RCU/Design/Requirements/Requirements.html#Tasks RCU: "The solution, in the form of Tasks RCU, is to have implicit read-side critical sections that are delimited by voluntary context switches, that is, calls to schedule(), cond_resched(), and synchronize_rcu_tasks(). In addition, transitions to and from userspace execution also delimit tasks-RCU read-side critical sections." I suppose transitions to/from userspace, as well as calls to schedule() result in context synchronizing instruction being executed. But, if some tasks call cond_resched() and synchronize_rcu_tasks(), we probably won't have a CSI executed. Also: "In CONFIG_PREEMPT=n kernels, trampolines cannot be preempted, so these APIs map to call_rcu(), synchronize_rcu(), and rcu_barrier(), respectively." In this scenario as well, I think we won't have a CSI executed in case of cond_resched(). Should we enhance patch_instruction() to handle that? Well, not sure. Do we have many post-boot callers of it? Should they take care of their own synchronization requirements? Kprobes and ftrace are the two users (along with anything else that may use jump labels). Looking at this from the CMODX perspective: the main example quoted of an erratic behavior is when any variant of the patched instruction causes an exception. With ftrace, I think we are ok since we only ever patch a 'nop' or a 'bl' (and the 'mflr' now), none of which should cause an exception. As such, the existing patch_instruction() should suffice. However, with kprobes, we patch a 'trap' (or a branch in case of optprobes) on most instructions. I wonder if we should be issuing an 'isync' on all cpus in this case. Or, even if that is sufficient or necessary. Thanks, Naveen
Re: [PATCH 4/7] powerpc/ftrace: Additionally nop out the preceding mflr with -mprofile-kernel
Nicholas Piggin wrote: Michael Ellerman's on June 19, 2019 3:14 pm: Hi Naveen, Sorry I meant to reply to this earlier .. :/ No problem. Thanks for the questions. "Naveen N. Rao" writes: With -mprofile-kernel, gcc emits 'mflr r0', followed by 'bl _mcount' to enable function tracing and profiling. So far, with dynamic ftrace, we used to only patch out the branch to _mcount(). However, mflr is executed by the branch unit that can only execute one per cycle on POWER9 and shared with branches, so it would be nice to avoid it where possible. We cannot simply nop out the mflr either. When enabling function tracing, there can be a race if tracing is enabled when some thread was interrupted after executing a nop'ed out mflr. In this case, the thread would execute the now-patched-in branch to _mcount() without having executed the preceding mflr. To solve this, we now enable function tracing in 2 steps: patch in the mflr instruction, use synchronize_rcu_tasks() to ensure all existing threads make progress, and then patch in the branch to _mcount(). We override ftrace_replace_code() with a powerpc64 variant for this purpose. According to the ISA we're not allowed to patch mflr at runtime. See the section on "CMODX". According to "quasi patch class" engineering note, we can patch anything with a preferred nop. But that's written as an optional facility, which we don't have a feature to test for. Hmm... I wonder what the implications are. We've been patching in a 'trap' for kprobes for a long time now, along with having to patch back the original instruction (which can be anything), when the probe is removed. I'm also not convinced the ordering between the two patches is guaranteed by the ISA, given that there's possibly no isync on the other CPU. Will they go through a context synchronizing event? synchronize_rcu_tasks() should ensure a thread is scheduled away, but I'm not actually sure it guarantees CSI if it's kernel->kernel. Could do a smp_call_function to do the isync on each CPU to be sure. Good point. Per Documentation/RCU/Design/Requirements/Requirements.html#Tasks RCU: "The solution, in the form of Tasks RCU, is to have implicit read-side critical sections that are delimited by voluntary context switches, that is, calls to schedule(), cond_resched(), and synchronize_rcu_tasks(). In addition, transitions to and from userspace execution also delimit tasks-RCU read-side critical sections." I suppose transitions to/from userspace, as well as calls to schedule() result in context synchronizing instruction being executed. But, if some tasks call cond_resched() and synchronize_rcu_tasks(), we probably won't have a CSI executed. Also: "In CONFIG_PREEMPT=n kernels, trampolines cannot be preempted, so these APIs map to call_rcu(), synchronize_rcu(), and rcu_barrier(), respectively." In this scenario as well, I think we won't have a CSI executed in case of cond_resched(). Should we enhance patch_instruction() to handle that? - Naveen
Re: [PATCH 5/7] powerpc/ftrace: Update ftrace_location() for powerpc -mprofile-kernel
Steven Rostedt wrote: On Tue, 18 Jun 2019 23:53:11 +0530 "Naveen N. Rao" wrote: Naveen N. Rao wrote: > Steven Rostedt wrote: >> On Tue, 18 Jun 2019 20:17:04 +0530 >> "Naveen N. Rao" wrote: >> >>> @@ -1551,7 +1551,7 @@ unsigned long ftrace_location_range(unsigned long start, unsigned long end) >>>key.flags = end;/* overload flags, as it is unsigned long */ >>> >>> for (pg = ftrace_pages_start; pg; pg = pg->next) { >>> - if (end < pg->records[0].ip || >>> + if (end <= pg->records[0].ip || >> >> This breaks the algorithm. "end" is inclusive. That is, if you look for >> a single byte, where "start" and "end" are the same, and it happens to >> be the first ip on the pg page, it will be skipped, and not found. > > Thanks. It looks like I should be over-riding ftrace_location() instead. > I will update this patch. I think I will have ftrace own the two instruction range, regardless of whether the preceding instruction is a 'mflr r0' or not. This simplifies things and I don't see an issue with it as of now. I will do more testing to confirm. - Naveen --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -951,6 +951,16 @@ void arch_ftrace_update_code(int command) } #ifdef CONFIG_MPROFILE_KERNEL +/* + * We consider two instructions -- 'mflr r0', 'bl _mcount' -- to be part + * of ftrace. When checking for the first instruction, we want to include + * the next instruction in the range check. + */ +unsigned long ftrace_location(unsigned long ip) +{ + return ftrace_location_range(ip, ip + MCOUNT_INSN_SIZE); +} + /* Returns 1 if we patched in the mflr */ static int __ftrace_make_call_prep(struct dyn_ftrace *rec) { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 21d8e201ee80..122e2bb4a739 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1573,7 +1573,7 @@ unsigned long ftrace_location_range(unsigned long start, unsigned long end) * the function tracer. It checks the ftrace internal tables to * determine if the address belongs or not. */ -unsigned long ftrace_location(unsigned long ip) +unsigned long __weak ftrace_location(unsigned long ip) { return ftrace_location_range(ip, ip); } Actually, instead of making this a weak function, let's do this: In include/ftrace.h: #ifndef FTRACE_IP_EXTENSION # define FTRACE_IP_EXTENSION0 #endif In arch/powerpc/include/asm/ftrace.h #define FTRACE_IP_EXTENSION MCOUNT_INSN_SIZE Then we can just have: unsigned long ftrace_location(unsigned long ip) { return ftrace_location_range(ip, ip + FTRACE_IP_EXTENSION); } Thanks, that's indeed nice. I hope you don't mind me adding your SOB for that. - Naveen
Re: [PATCH 5/7] powerpc/ftrace: Update ftrace_location() for powerpc -mprofile-kernel
Naveen N. Rao wrote: Steven Rostedt wrote: On Tue, 18 Jun 2019 20:17:04 +0530 "Naveen N. Rao" wrote: @@ -1551,7 +1551,7 @@ unsigned long ftrace_location_range(unsigned long start, unsigned long end) key.flags = end;/* overload flags, as it is unsigned long */ for (pg = ftrace_pages_start; pg; pg = pg->next) { - if (end < pg->records[0].ip || + if (end <= pg->records[0].ip || This breaks the algorithm. "end" is inclusive. That is, if you look for a single byte, where "start" and "end" are the same, and it happens to be the first ip on the pg page, it will be skipped, and not found. Thanks. It looks like I should be over-riding ftrace_location() instead. I will update this patch. I think I will have ftrace own the two instruction range, regardless of whether the preceding instruction is a 'mflr r0' or not. This simplifies things and I don't see an issue with it as of now. I will do more testing to confirm. - Naveen --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -951,6 +951,16 @@ void arch_ftrace_update_code(int command) } #ifdef CONFIG_MPROFILE_KERNEL +/* + * We consider two instructions -- 'mflr r0', 'bl _mcount' -- to be part + * of ftrace. When checking for the first instruction, we want to include + * the next instruction in the range check. + */ +unsigned long ftrace_location(unsigned long ip) +{ + return ftrace_location_range(ip, ip + MCOUNT_INSN_SIZE); +} + /* Returns 1 if we patched in the mflr */ static int __ftrace_make_call_prep(struct dyn_ftrace *rec) { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 21d8e201ee80..122e2bb4a739 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1573,7 +1573,7 @@ unsigned long ftrace_location_range(unsigned long start, unsigned long end) * the function tracer. It checks the ftrace internal tables to * determine if the address belongs or not. */ -unsigned long ftrace_location(unsigned long ip) +unsigned long __weak ftrace_location(unsigned long ip) { return ftrace_location_range(ip, ip); }
Re: [PATCH 5/7] powerpc/ftrace: Update ftrace_location() for powerpc -mprofile-kernel
Steven Rostedt wrote: On Tue, 18 Jun 2019 20:17:04 +0530 "Naveen N. Rao" wrote: @@ -1551,7 +1551,7 @@ unsigned long ftrace_location_range(unsigned long start, unsigned long end) key.flags = end;/* overload flags, as it is unsigned long */ for (pg = ftrace_pages_start; pg; pg = pg->next) { - if (end < pg->records[0].ip || + if (end <= pg->records[0].ip || This breaks the algorithm. "end" is inclusive. That is, if you look for a single byte, where "start" and "end" are the same, and it happens to be the first ip on the pg page, it will be skipped, and not found. Thanks. It looks like I should be over-riding ftrace_location() instead. I will update this patch. - Naveen
[PATCH 5/7] powerpc/ftrace: Update ftrace_location() for powerpc -mprofile-kernel
Now that we are patching the preceding 'mflr r0' instruction with -mprofile-kernel, we need to update ftrace_location[_range]() to recognise that as being part of ftrace. To do this, we make a small change to ftrace_location_range() and convert ftrace_cmp_recs() into a weak function. We implement a custom version of ftrace_cmp_recs() which looks at the instruction preceding the branch to _mcount() and marks that instruction as belonging to ftrace if it is a 'nop' or 'mflr r0'. Signed-off-by: Naveen N. Rao --- arch/powerpc/kernel/trace/ftrace.c | 31 ++ include/linux/ftrace.h | 1 + kernel/trace/ftrace.c | 4 ++-- 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index 5e2b29808af1..b84046e43207 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -951,6 +951,37 @@ void arch_ftrace_update_code(int command) } #ifdef CONFIG_MPROFILE_KERNEL +/* + * We need to check if the previous instruction is a 'nop' or 'mflr r0'. + * If so, we will patch those subsequently and that instruction must be + * considered as part of ftrace. + */ +int ftrace_cmp_recs(const void *a, const void *b) +{ + const struct dyn_ftrace *key = a; + const struct dyn_ftrace *rec = b; + unsigned int op; + + if (key->flags < rec->ip - MCOUNT_INSN_SIZE) + return -1; + if (key->ip >= rec->ip + MCOUNT_INSN_SIZE) + return 1; + + if (key->flags > rec->ip) + return 0; + + /* check the previous instruction */ + if (probe_kernel_read(, (void *)rec->ip - MCOUNT_INSN_SIZE, + sizeof(op))) + /* assume we own it */ + return 0; + + if (op != PPC_INST_NOP && op != PPC_INST_MFLR) + return -1; + + return 0; +} + /* Returns 1 if we patched in the mflr */ static int __ftrace_make_call_prep(struct dyn_ftrace *rec) { diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index fa653a561da5..9941987bf510 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -435,6 +435,7 @@ struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter); int ftrace_update_record(struct dyn_ftrace *rec, int enable); int ftrace_test_record(struct dyn_ftrace *rec, int enable); void ftrace_run_stop_machine(int command); +int ftrace_cmp_recs(const void *a, const void *b); unsigned long ftrace_location(unsigned long ip); unsigned long ftrace_location_range(unsigned long start, unsigned long end); unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 21d8e201ee80..b5c61db0b452 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1517,7 +1517,7 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) } -static int ftrace_cmp_recs(const void *a, const void *b) +int __weak ftrace_cmp_recs(const void *a, const void *b) { const struct dyn_ftrace *key = a; const struct dyn_ftrace *rec = b; @@ -1551,7 +1551,7 @@ unsigned long ftrace_location_range(unsigned long start, unsigned long end) key.flags = end;/* overload flags, as it is unsigned long */ for (pg = ftrace_pages_start; pg; pg = pg->next) { - if (end < pg->records[0].ip || + if (end <= pg->records[0].ip || start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) continue; rec = bsearch(, pg->records, pg->index, -- 2.22.0
[PATCH 6/7] kprobes/ftrace: Use ftrace_location() when [dis]arming probes
Ftrace location could include more than a single instruction in case of some architectures (powerpc64, for now). In this case, kprobe is permitted on any of those instructions, and uses ftrace infrastructure for functioning. However, [dis]arm_kprobe_ftrace() uses the kprobe address when setting up ftrace filter IP. This won't work if the address points to any instruction apart from the one that has a branch to _mcount(). To resolve this, have [dis]arm_kprobe_ftrace() use ftrace_function() to identify the filter IP. Signed-off-by: Naveen N. Rao --- kernel/kprobes.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 445337c107e0..282ee704e2d8 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -978,10 +978,10 @@ static int prepare_kprobe(struct kprobe *p) /* Caller must lock kprobe_mutex */ static int arm_kprobe_ftrace(struct kprobe *p) { + unsigned long ftrace_ip = ftrace_location((unsigned long)p->addr); int ret = 0; - ret = ftrace_set_filter_ip(_ftrace_ops, - (unsigned long)p->addr, 0, 0); + ret = ftrace_set_filter_ip(_ftrace_ops, ftrace_ip, 0, 0); if (ret) { pr_debug("Failed to arm kprobe-ftrace at %pS (%d)\n", p->addr, ret); @@ -1005,13 +1005,14 @@ static int arm_kprobe_ftrace(struct kprobe *p) * non-empty filter_hash for IPMODIFY ops, we're safe from an accidental * empty filter_hash which would undesirably trace all functions. */ - ftrace_set_filter_ip(_ftrace_ops, (unsigned long)p->addr, 1, 0); + ftrace_set_filter_ip(_ftrace_ops, ftrace_ip, 1, 0); return ret; } /* Caller must lock kprobe_mutex */ static int disarm_kprobe_ftrace(struct kprobe *p) { + unsigned long ftrace_ip = ftrace_location((unsigned long)p->addr); int ret = 0; if (kprobe_ftrace_enabled == 1) { @@ -1022,8 +1023,7 @@ static int disarm_kprobe_ftrace(struct kprobe *p) kprobe_ftrace_enabled--; - ret = ftrace_set_filter_ip(_ftrace_ops, - (unsigned long)p->addr, 1, 0); + ret = ftrace_set_filter_ip(_ftrace_ops, ftrace_ip, 1, 0); WARN_ONCE(ret < 0, "Failed to disarm kprobe-ftrace at %pS (%d)\n", p->addr, ret); return ret; -- 2.22.0
[PATCH 0/7] powerpc/ftrace: Patch out -mprofile-kernel instructions
Changes since RFC: - Patches 1 and 2: No changes - Patch 3: rename function to ftrace_replace_code_rec() to signify that it acts on a ftrace record. - Patch 4: numerous small changes, including those suggested by Nick Piggin. - Patch 4: additionally handle scenario where __ftrace_make_call() can be invoked without having called __ftrace_make_call_prep(), when a kernel module is loaded while function tracing is active. - Patch 5 to 7: new, to have ftrace claim ownership of two instructions, and to have kprobes work properly with this. -- On powerpc64, -mprofile-kernel results in two instructions being emitted: 'mflr r0' and 'bl _mcount'. So far, we were only nop'ing out the branch to _mcount(). This series implements an approach to also nop out the preceding mflr. - Naveen Naveen N. Rao (7): ftrace: Expose flags used for ftrace_replace_code() x86/ftrace: Fix use of flags in ftrace_replace_code() ftrace: Expose __ftrace_replace_code() powerpc/ftrace: Additionally nop out the preceding mflr with -mprofile-kernel powerpc/ftrace: Update ftrace_location() for powerpc -mprofile-kernel kprobes/ftrace: Use ftrace_location() when [dis]arming probes powerpc/kprobes: Allow probing on any ftrace address arch/powerpc/kernel/kprobes-ftrace.c | 30 +++ arch/powerpc/kernel/trace/ftrace.c | 272 --- arch/x86/kernel/ftrace.c | 3 +- include/linux/ftrace.h | 7 + kernel/kprobes.c | 10 +- kernel/trace/ftrace.c| 17 +- 6 files changed, 300 insertions(+), 39 deletions(-) -- 2.22.0
[PATCH 4/7] powerpc/ftrace: Additionally nop out the preceding mflr with -mprofile-kernel
With -mprofile-kernel, gcc emits 'mflr r0', followed by 'bl _mcount' to enable function tracing and profiling. So far, with dynamic ftrace, we used to only patch out the branch to _mcount(). However, mflr is executed by the branch unit that can only execute one per cycle on POWER9 and shared with branches, so it would be nice to avoid it where possible. We cannot simply nop out the mflr either. When enabling function tracing, there can be a race if tracing is enabled when some thread was interrupted after executing a nop'ed out mflr. In this case, the thread would execute the now-patched-in branch to _mcount() without having executed the preceding mflr. To solve this, we now enable function tracing in 2 steps: patch in the mflr instruction, use synchronize_rcu_tasks() to ensure all existing threads make progress, and then patch in the branch to _mcount(). We override ftrace_replace_code() with a powerpc64 variant for this purpose. Suggested-by: Nicholas Piggin Reviewed-by: Nicholas Piggin Signed-off-by: Naveen N. Rao --- arch/powerpc/kernel/trace/ftrace.c | 241 ++--- 1 file changed, 219 insertions(+), 22 deletions(-) diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index 517662a56bdc..5e2b29808af1 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -125,7 +125,7 @@ __ftrace_make_nop(struct module *mod, { unsigned long entry, ptr, tramp; unsigned long ip = rec->ip; - unsigned int op, pop; + unsigned int op; /* read where this goes */ if (probe_kernel_read(, (void *)ip, sizeof(int))) { @@ -160,8 +160,6 @@ __ftrace_make_nop(struct module *mod, #ifdef CONFIG_MPROFILE_KERNEL /* When using -mkernel_profile there is no load to jump over */ - pop = PPC_INST_NOP; - if (probe_kernel_read(, (void *)(ip - 4), 4)) { pr_err("Fetching instruction at %lx failed.\n", ip - 4); return -EFAULT; @@ -169,26 +167,23 @@ __ftrace_make_nop(struct module *mod, /* We expect either a mflr r0, or a std r0, LRSAVE(r1) */ if (op != PPC_INST_MFLR && op != PPC_INST_STD_LR) { - pr_err("Unexpected instruction %08x around bl _mcount\n", op); + pr_err("Unexpected instruction %08x before bl _mcount\n", op); return -EINVAL; } -#else - /* -* Our original call site looks like: -* -* bl -* ld r2,XX(r1) -* -* Milton Miller pointed out that we can not simply nop the branch. -* If a task was preempted when calling a trace function, the nops -* will remove the way to restore the TOC in r2 and the r2 TOC will -* get corrupted. -* -* Use a b +8 to jump over the load. -*/ - pop = PPC_INST_BRANCH | 8; /* b +8 */ + /* We should patch out the bl to _mcount first */ + if (patch_instruction((unsigned int *)ip, PPC_INST_NOP)) { + pr_err("Patching NOP failed.\n"); + return -EPERM; + } + /* then, nop out the preceding 'mflr r0' as an optimization */ + if (op == PPC_INST_MFLR && + patch_instruction((unsigned int *)(ip - 4), PPC_INST_NOP)) { + pr_err("Patching NOP failed.\n"); + return -EPERM; + } +#else /* * Check what is in the next instruction. We can see ld r2,40(r1), but * on first pass after boot we will see mflr r0. @@ -202,12 +197,25 @@ __ftrace_make_nop(struct module *mod, pr_err("Expected %08x found %08x\n", PPC_INST_LD_TOC, op); return -EINVAL; } -#endif /* CONFIG_MPROFILE_KERNEL */ - if (patch_instruction((unsigned int *)ip, pop)) { + /* +* Our original call site looks like: +* +* bl +* ld r2,XX(r1) +* +* Milton Miller pointed out that we can not simply nop the branch. +* If a task was preempted when calling a trace function, the nops +* will remove the way to restore the TOC in r2 and the r2 TOC will +* get corrupted. +* +* Use a b +8 to jump over the load. +*/ + if (patch_instruction((unsigned int *)ip, PPC_INST_BRANCH | 8)) { pr_err("Patching NOP failed.\n"); return -EPERM; } +#endif /* CONFIG_MPROFILE_KERNEL */ return 0; } @@ -421,6 +429,26 @@ static int __ftrace_make_nop_kernel(struct dyn_ftrace *rec, unsigned long addr) return -EPERM; } +#ifdef CONFIG_MPROFILE_KERNEL + /* Nop out the preceding 'mflr r0' as an optimization */ + if (probe_kernel_read(, (void *)(ip - 4), 4)) { + pr_err("Fetching instruction at %lx failed.\n", ip - 4); + return -E
[PATCH 7/7] powerpc/kprobes: Allow probing on any ftrace address
With KPROBES_ON_FTRACE, kprobe is allowed to be inserted on instructions that branch to _mcount (referred to as ftrace location). With -mprofile-kernel, we now include the preceding 'mflr r0' as being part of the ftrace location. However, by default, probing on an instruction that is not actually the branch to _mcount() is prohibited, as that is considered to not be at an instruction boundary. This is not the case on powerpc, so allow the same by overriding arch_check_ftrace_location() In addition, we update kprobe_ftrace_handler() to detect this scenarios and to pass the proper nip to the pre and post probe handlers. Signed-off-by: Naveen N. Rao --- arch/powerpc/kernel/kprobes-ftrace.c | 30 1 file changed, 30 insertions(+) diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c index 972cb28174b2..6a0bd3c16cb6 100644 --- a/arch/powerpc/kernel/kprobes-ftrace.c +++ b/arch/powerpc/kernel/kprobes-ftrace.c @@ -12,14 +12,34 @@ #include #include +/* + * With -mprofile-kernel, we patch two instructions -- the branch to _mcount + * as well as the preceding 'mflr r0'. Both these instructions are claimed + * by ftrace and we should allow probing on either instruction. + */ +int arch_check_ftrace_location(struct kprobe *p) +{ + if (ftrace_location((unsigned long)p->addr)) + p->flags |= KPROBE_FLAG_FTRACE; + return 0; +} + /* Ftrace callback handler for kprobes */ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, struct ftrace_ops *ops, struct pt_regs *regs) { struct kprobe *p; + int mflr_kprobe = 0; struct kprobe_ctlblk *kcb; p = get_kprobe((kprobe_opcode_t *)nip); + if (unlikely(!p)) { + p = get_kprobe((kprobe_opcode_t *)(nip - MCOUNT_INSN_SIZE)); + if (!p) + return; + mflr_kprobe = 1; + } + if (unlikely(!p) || kprobe_disabled(p)) return; @@ -33,6 +53,9 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, */ regs->nip -= MCOUNT_INSN_SIZE; + if (mflr_kprobe) + regs->nip -= MCOUNT_INSN_SIZE; + __this_cpu_write(current_kprobe, p); kcb->kprobe_status = KPROBE_HIT_ACTIVE; if (!p->pre_handler || !p->pre_handler(p, regs)) { @@ -45,6 +68,8 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, kcb->kprobe_status = KPROBE_HIT_SSDONE; p->post_handler(p, regs, 0); } + if (mflr_kprobe) + regs->nip += MCOUNT_INSN_SIZE; } /* * If pre_handler returns !0, it changes regs->nip. We have to @@ -57,6 +82,11 @@ NOKPROBE_SYMBOL(kprobe_ftrace_handler); int arch_prepare_kprobe_ftrace(struct kprobe *p) { + if ((unsigned long)p->addr & 0x03) { + printk("Attempt to register kprobe at an unaligned address\n"); + return -EILSEQ; + } + p->ainsn.insn = NULL; p->ainsn.boostable = -1; return 0; -- 2.22.0
[PATCH 3/7] ftrace: Expose __ftrace_replace_code()
While over-riding ftrace_replace_code(), we still want to reuse the existing __ftrace_replace_code() function. Rename the function and make it available for other kernel code. Signed-off-by: Naveen N. Rao --- include/linux/ftrace.h | 1 + kernel/trace/ftrace.c | 8 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index e97789c95c4e..fa653a561da5 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -456,6 +456,7 @@ ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable); /* defined in arch */ extern int ftrace_ip_converted(unsigned long ip); extern int ftrace_dyn_arch_init(void); +extern int ftrace_replace_code_rec(struct dyn_ftrace *rec, int enable); extern void ftrace_replace_code(int enable); extern int ftrace_update_ftrace_func(ftrace_func_t func); extern void ftrace_caller(void); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5710a6b3edc1..21d8e201ee80 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2351,8 +2351,8 @@ unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) return (unsigned long)FTRACE_ADDR; } -static int -__ftrace_replace_code(struct dyn_ftrace *rec, int enable) +int +ftrace_replace_code_rec(struct dyn_ftrace *rec, int enable) { unsigned long ftrace_old_addr; unsigned long ftrace_addr; @@ -2403,7 +2403,7 @@ void __weak ftrace_replace_code(int mod_flags) if (rec->flags & FTRACE_FL_DISABLED) continue; - failed = __ftrace_replace_code(rec, enable); + failed = ftrace_replace_code_rec(rec, enable); if (failed) { ftrace_bug(failed, rec); /* Stop processing */ @@ -5827,7 +5827,7 @@ void ftrace_module_enable(struct module *mod) rec->flags = cnt; if (ftrace_start_up && cnt) { - int failed = __ftrace_replace_code(rec, 1); + int failed = ftrace_replace_code_rec(rec, 1); if (failed) { ftrace_bug(failed, rec); goto out_loop; -- 2.22.0
[PATCH 1/7] ftrace: Expose flags used for ftrace_replace_code()
Since ftrace_replace_code() is a __weak function and can be overridden, we need to expose the flags that can be set. So, move the flags enum to the header file. Reviewed-by: Steven Rostedt (VMware) Signed-off-by: Naveen N. Rao --- include/linux/ftrace.h | 5 + kernel/trace/ftrace.c | 5 - 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 25e2995d4a4c..e97789c95c4e 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -162,6 +162,11 @@ enum { FTRACE_OPS_FL_TRACE_ARRAY = 1 << 15, }; +enum { + FTRACE_MODIFY_ENABLE_FL = (1 << 0), + FTRACE_MODIFY_MAY_SLEEP_FL = (1 << 1), +}; + #ifdef CONFIG_DYNAMIC_FTRACE /* The hash used to know what functions callbacks trace */ struct ftrace_ops_hash { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 38277af44f5c..5710a6b3edc1 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -75,11 +75,6 @@ #define INIT_OPS_HASH(opsname) #endif -enum { - FTRACE_MODIFY_ENABLE_FL = (1 << 0), - FTRACE_MODIFY_MAY_SLEEP_FL = (1 << 1), -}; - struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB, -- 2.22.0
[PATCH 2/7] x86/ftrace: Fix use of flags in ftrace_replace_code()
In commit a0572f687fb3c ("ftrace: Allow ftrace_replace_code() to be schedulable), the generic ftrace_replace_code() function was modified to accept a flags argument in place of a single 'enable' flag. However, the x86 version of this function was not updated. Fix the same. Fixes: a0572f687fb3c ("ftrace: Allow ftrace_replace_code() to be schedulable") Signed-off-by: Naveen N. Rao --- arch/x86/kernel/ftrace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 0927bb158ffc..f34005a17051 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -573,8 +573,9 @@ static void run_sync(void) local_irq_disable(); } -void ftrace_replace_code(int enable) +void ftrace_replace_code(int mod_flags) { + int enable = mod_flags & FTRACE_MODIFY_ENABLE_FL; struct ftrace_rec_iter *iter; struct dyn_ftrace *rec; const char *report = "adding breakpoints"; -- 2.22.0
Re: [PATCH] Powerpc/Watchpoint: Restore nvgprs while returning from exception
Ravi Bangoria wrote: Powerpc hw triggers watchpoint before executing the instruction. To make trigger-after-execute behavior, kernel emulates the instruction. If the instruction is 'load something into non- volatile register', exception handler should restore emulated register state while returning back, otherwise there will be register state corruption. Ex, Adding a watchpoint on a list can corrput the list: # cat /proc/kallsyms | grep kthread_create_list c121c8b8 d kthread_create_list Add watchpoint on kthread_create_list->next: # perf record -e mem:0xc121c8c0 Run some workload such that new kthread gets invoked. Ex, I just logged out from console: list_add corruption. next->prev should be prev (c1214e00), \ but was c121c8b8. (next=c121c8b8). WARNING: CPU: 59 PID: 309 at lib/list_debug.c:25 __list_add_valid+0xb4/0xc0 CPU: 59 PID: 309 Comm: kworker/59:0 Kdump: loaded Not tainted 5.1.0-rc7+ #69 ... NIP __list_add_valid+0xb4/0xc0 LR __list_add_valid+0xb0/0xc0 Call Trace: __list_add_valid+0xb0/0xc0 (unreliable) __kthread_create_on_node+0xe0/0x260 kthread_create_on_node+0x34/0x50 create_worker+0xe8/0x260 worker_thread+0x444/0x560 kthread+0x160/0x1a0 ret_from_kernel_thread+0x5c/0x70 Signed-off-by: Ravi Bangoria --- arch/powerpc/kernel/exceptions-64s.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Awesome catch - this one has had a glorious run... Fixes: 5aae8a5370802 ("powerpc, hw_breakpoints: Implement hw_breakpoints for 64-bit server processors") Reviewed-by: Naveen N. Rao - Naveen
Re: [RFC PATCH 2/4] x86/ftrace: Fix use of flags in ftrace_replace_code()
Hi Steven, Steven Rostedt wrote: On Mon, 20 May 2019 09:13:20 -0400 Steven Rostedt wrote: > I haven't yet tested this patch on x86, but this looked wrong so sending > this as a RFC. This code has been through a bit of updates, and I need to go through and clean it up. I'll have to take a look and convert "int" to "bool" so that "enable" is not confusing. Thanks, I think I'll try to do a clean up first, and then this patch shouldn't "look wrong" after that. I'm going to apply the attached two patches. There may be some conflicts between yours here and these, but nothing that Linus can't figure out. Do you feel more comfortable with this code, if these patches are applied? Thanks, that definitely helps make things clearer. A very small nit from your first patch -- it would be good to also convert the calls to ftrace_check_record() to use 'true' or 'false' for the 'update' field. I will test my series in more detail and post a v1. - Naveen
Re: [RFC PATCH 4/4] powerpc/ftrace: Additionally nop out the preceding mflr with -mprofile-kernel
Nicholas Piggin wrote: Naveen N. Rao's on May 18, 2019 5:02 am: With -mprofile-kernel, gcc emits 'mflr r0', followed by 'bl _mcount' to enable function tracing and profiling. So far, with dynamic ftrace, we used to only patch out the branch to _mcount(). However, Nick Piggin points out that "mflr is executed by the branch unit that can only execute one per cycle on POWER9 and shared with branches, so it would be nice to avoid it where possible." We cannot simply nop out the mflr either. Michael Ellerman pointed out that when enabling function tracing, there can be a race if tracing is enabled when some thread was interrupted after executing a nop'ed out mflr. In this case, the thread would execute the now-patched-in branch to _mcount() without having executed the preceding mflr. To solve this, we now enable function tracing in 2 steps: patch in the mflr instruction, use synchronize_rcu_tasks() to ensure all existing threads make progress, and then patch in the branch to _mcount(). We override ftrace_replace_code() with a powerpc64 variant for this purpose. Signed-off-by: Nicholas Piggin Signed-off-by: Naveen N. Rao Nice! Thanks for doing a real patch. You needn't add my SOB there: my hack was obviously garbage :) Suggested-by if anything, then for clarity of changelog you can write the motivation directly rather than quote me. Thanks, I meant to call out the fact that I had added your SOB before sending the patch, but missed doing so. Your patch was perfectly fine ;) I don't know the ftrace subsystem well, but the powerpc instructions and patching sequence appears to match what we agreed is the right way to go. As a suggestion, I would perhaps add most of information from the second and third paragraphs of the changelog into comments (and also explain that the lone mflr r0 is harmless). But otherwise it looks good Reviewed-by: Nicholas Piggin Thanks, I will incorporate those changes. - Naveen
[RFC PATCH 4/4] powerpc/ftrace: Additionally nop out the preceding mflr with -mprofile-kernel
With -mprofile-kernel, gcc emits 'mflr r0', followed by 'bl _mcount' to enable function tracing and profiling. So far, with dynamic ftrace, we used to only patch out the branch to _mcount(). However, Nick Piggin points out that "mflr is executed by the branch unit that can only execute one per cycle on POWER9 and shared with branches, so it would be nice to avoid it where possible." We cannot simply nop out the mflr either. Michael Ellerman pointed out that when enabling function tracing, there can be a race if tracing is enabled when some thread was interrupted after executing a nop'ed out mflr. In this case, the thread would execute the now-patched-in branch to _mcount() without having executed the preceding mflr. To solve this, we now enable function tracing in 2 steps: patch in the mflr instruction, use synchronize_rcu_tasks() to ensure all existing threads make progress, and then patch in the branch to _mcount(). We override ftrace_replace_code() with a powerpc64 variant for this purpose. Signed-off-by: Nicholas Piggin Signed-off-by: Naveen N. Rao --- arch/powerpc/kernel/trace/ftrace.c | 188 + 1 file changed, 166 insertions(+), 22 deletions(-) diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index 517662a56bdc..5c3523c3b259 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -125,7 +125,7 @@ __ftrace_make_nop(struct module *mod, { unsigned long entry, ptr, tramp; unsigned long ip = rec->ip; - unsigned int op, pop; + unsigned int op; /* read where this goes */ if (probe_kernel_read(, (void *)ip, sizeof(int))) { @@ -160,8 +160,6 @@ __ftrace_make_nop(struct module *mod, #ifdef CONFIG_MPROFILE_KERNEL /* When using -mkernel_profile there is no load to jump over */ - pop = PPC_INST_NOP; - if (probe_kernel_read(, (void *)(ip - 4), 4)) { pr_err("Fetching instruction at %lx failed.\n", ip - 4); return -EFAULT; @@ -169,26 +167,22 @@ __ftrace_make_nop(struct module *mod, /* We expect either a mflr r0, or a std r0, LRSAVE(r1) */ if (op != PPC_INST_MFLR && op != PPC_INST_STD_LR) { - pr_err("Unexpected instruction %08x around bl _mcount\n", op); + pr_err("Unexpected instruction %08x before bl _mcount\n", op); return -EINVAL; } -#else - /* -* Our original call site looks like: -* -* bl -* ld r2,XX(r1) -* -* Milton Miller pointed out that we can not simply nop the branch. -* If a task was preempted when calling a trace function, the nops -* will remove the way to restore the TOC in r2 and the r2 TOC will -* get corrupted. -* -* Use a b +8 to jump over the load. -*/ - pop = PPC_INST_BRANCH | 8; /* b +8 */ + /* We should patch out the bl to _mcount first */ + if (patch_instruction((unsigned int *)ip, PPC_INST_NOP)) { + pr_err("Patching NOP failed.\n"); + return -EPERM; + } + if (op == PPC_INST_MFLR && + patch_instruction((unsigned int *)(ip - 4), PPC_INST_NOP)) { + pr_err("Patching NOP failed.\n"); + return -EPERM; + } +#else /* * Check what is in the next instruction. We can see ld r2,40(r1), but * on first pass after boot we will see mflr r0. @@ -202,12 +196,25 @@ __ftrace_make_nop(struct module *mod, pr_err("Expected %08x found %08x\n", PPC_INST_LD_TOC, op); return -EINVAL; } -#endif /* CONFIG_MPROFILE_KERNEL */ - if (patch_instruction((unsigned int *)ip, pop)) { + /* +* Our original call site looks like: +* +* bl +* ld r2,XX(r1) +* +* Milton Miller pointed out that we can not simply nop the branch. +* If a task was preempted when calling a trace function, the nops +* will remove the way to restore the TOC in r2 and the r2 TOC will +* get corrupted. +* +* Use a b +8 to jump over the load. +*/ + if (patch_instruction((unsigned int *)ip, PPC_INST_BRANCH | 8)) { pr_err("Patching NOP failed.\n"); return -EPERM; } +#endif /* CONFIG_MPROFILE_KERNEL */ return 0; } @@ -421,6 +428,25 @@ static int __ftrace_make_nop_kernel(struct dyn_ftrace *rec, unsigned long addr) return -EPERM; } +#ifdef CONFIG_MPROFILE_KERNEL + if (probe_kernel_read(, (void *)(ip - 4), 4)) { + pr_err("Fetching instruction at %lx failed.\n", ip - 4); + return -EFAULT; + } + + /* We expect either a mflr r0, or a std r0, LRSAVE(r1) */ + if (
[RFC PATCH 1/4] ftrace: Expose flags used for ftrace_replace_code()
Since ftrace_replace_code() is a __weak function and can be overridden, we need to expose the flags that can be set. So, move the flags enum to the header file. Signed-off-by: Naveen N. Rao --- include/linux/ftrace.h | 5 + kernel/trace/ftrace.c | 5 - 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 20899919ead8..835e761f63b0 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -162,6 +162,11 @@ enum { FTRACE_OPS_FL_TRACE_ARRAY = 1 << 15, }; +enum { + FTRACE_MODIFY_ENABLE_FL = (1 << 0), + FTRACE_MODIFY_MAY_SLEEP_FL = (1 << 1), +}; + #ifdef CONFIG_DYNAMIC_FTRACE /* The hash used to know what functions callbacks trace */ struct ftrace_ops_hash { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b920358dd8f7..38c15cd27fc4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -78,11 +78,6 @@ #define ASSIGN_OPS_HASH(opsname, val) #endif -enum { - FTRACE_MODIFY_ENABLE_FL = (1 << 0), - FTRACE_MODIFY_MAY_SLEEP_FL = (1 << 1), -}; - struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB, -- 2.21.0