[tip:perf/core] perf: Avoid horrible stack usage

2015-01-14 Thread tip-bot for Peter Zijlstra (Intel)
Commit-ID:  86038c5ea81b519a8a1fcfcd5e4599aab0cdd119
Gitweb: http://git.kernel.org/tip/86038c5ea81b519a8a1fcfcd5e4599aab0cdd119
Author: Peter Zijlstra (Intel) 
AuthorDate: Tue, 16 Dec 2014 12:47:34 +0100
Committer:  Ingo Molnar 
CommitDate: Wed, 14 Jan 2015 15:11:45 +0100

perf: Avoid horrible stack usage

Both Linus (most recently) and Steve (a while ago) reported that
perf-related callbacks have massive stack bloat.

The problem is that software events need a pt_regs in order to
properly report the event location and unwind the stack. Because we
could not assume one was present, we allocated one on the stack and
filled it with the minimal bits required for operation.

Now, pt_regs is quite large, so this is undesirable. Furthermore, it
turns out that most sites actually have a pt_regs pointer available,
which makes the on-stack copy even more onerous: its stack space is
pure waste.

This patch addresses the problem by observing that software events
have well-defined nesting semantics, so we can use static per-cpu
storage instead of the on-stack allocation.
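
The 'nesting semantics' in question are perf's recursion contexts: on
any given CPU, execution nests as task -> softirq -> hardirq -> NMI,
and perf's recursion tracking already refuses re-entry within a single
level, so four per-cpu pt_regs slots (roughly 168 bytes each on
x86-64) can never be overwritten while live. A sketch of how a slot
would be picked, reusing the existing recursion-context helpers; this
illustrates the idea and is not the patch's literal code:

/*
 * Illustration only: one pt_regs slot per CPU per nesting level
 * (0 = task, 1 = softirq, 2 = hardirq, 3 = NMI, matching perf's
 * recursion-context convention).
 */
int rctx = perf_swevent_get_recursion_context();
if (rctx >= 0) {
	struct pt_regs *regs = this_cpu_ptr(&__perf_regs[rctx]);

	perf_fetch_caller_regs(regs);
	/* ... emit the software event using regs ... */
	perf_swevent_put_recursion_context(rctx);
}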

Linus made the further observation that all but the scheduler callers
of perf_sw_event() have a pt_regs available, so we change the regular
perf_sw_event() to require a valid pt_regs (where it used to be
optional) and add perf_sw_event_sched() for the scheduler.
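
For the scheduler side, the kernel/sched/core.c hunk is not visible in
the (truncated) diff below, but per the diffstat it is a one-line
change, presumably of this shape at the CPU-migration call site:

-	perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
+	perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);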

We use a scheduler-specific call instead of a more generic
_noregs()-like construct because we can assume non-recursion from the
scheduler and thereby simplify the code further (_noregs() would have
to put the recursion-context call inline in order to ascertain which
__perf_regs element to use).
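
For comparison, such a generic helper would have to look roughly like
this (hypothetical sketch only, not part of this patch); note the
perf_swevent_get_recursion_context() call that perf_sw_event_sched()
avoids by hard-coding slot 0, the task context:

/* Hypothetical perf_sw_event_noregs(); NOT part of this patch. */
static __always_inline void
perf_sw_event_noregs(u32 event_id, u64 nr, u64 addr)
{
	if (static_key_false(&perf_swevent_enabled[event_id])) {
		int rctx = perf_swevent_get_recursion_context();
		struct pt_regs *regs;

		if (rctx < 0)
			return;
		regs = this_cpu_ptr(&__perf_regs[rctx]);
		perf_fetch_caller_regs(regs);
		___perf_sw_event(event_id, nr, regs, addr);
		perf_swevent_put_recursion_context(rctx);
	}
}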

One last note on the implementation of perf_trace_buf_prepare(): we
allow a NULL regs argument for those cases where the caller already
has a pt_regs pointer available and does not need another.
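
The resulting caller patterns, sketched with placeholder size and
event_type values (the new double-pointer signature is in the
ftrace_event.h hunk below):

struct pt_regs *regs;
int rctx;
void *entry;

/*
 * Tracepoint-style caller with no pt_regs of its own: pass &regs so
 * perf_trace_buf_prepare() can point it at the per-cpu storage; the
 * caller then fills it via perf_fetch_caller_regs(regs).
 */
entry = perf_trace_buf_prepare(size, event_type, &regs, &rctx);

/*
 * Kprobe/uprobe/syscall-style caller that already holds a pt_regs
 * pointer: pass NULL and keep using its own regs.
 */
entry = perf_trace_buf_prepare(size, event_type, NULL, &rctx);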

Reported-by: Linus Torvalds 
Reported-by: Steven Rostedt 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: Arnaldo Carvalho de Melo 
Cc: Javi Merino 
Cc: Linus Torvalds 
Cc: Mathieu Desnoyers 
Cc: Oleg Nesterov 
Cc: Paul Mackerras 
Cc: Petr Mladek 
Cc: Steven Rostedt 
Cc: Tom Zanussi 
Cc: Vaibhav Nagarnaik 
Link: http://lkml.kernel.org/r/20141216115041.gw3...@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar 
---
 include/linux/ftrace_event.h|  2 +-
 include/linux/perf_event.h  | 28 +---
 include/trace/ftrace.h  |  7 ---
 kernel/events/core.c| 23 +--
 kernel/sched/core.c |  2 +-
 kernel/trace/trace_event_perf.c |  4 +++-
 kernel/trace/trace_kprobe.c |  4 ++--
 kernel/trace/trace_syscalls.c   |  4 ++--
 kernel/trace/trace_uprobe.c |  2 +-
 9 files changed, 52 insertions(+), 24 deletions(-)

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 0bebb5c..d36f68b 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -595,7 +595,7 @@ extern int  ftrace_profile_set_filter(struct perf_event *event, int event_id,
 				     char *filter_str);
 extern void ftrace_profile_free_filter(struct perf_event *event);
 extern void *perf_trace_buf_prepare(int size, unsigned short type,
-				    struct pt_regs *regs, int *rctxp);
+				    struct pt_regs **regs, int *rctxp);
 
 static inline void
 perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr,
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 4f7a61c..3a7bd80 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -665,6 +665,7 @@ static inline int is_software_event(struct perf_event *event)
 
 extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
 
+extern void ___perf_sw_event(u32, u64, struct pt_regs *, u64);
 extern void __perf_sw_event(u32, u64, struct pt_regs *, u64);
 
 #ifndef perf_arch_fetch_caller_regs
@@ -689,14 +690,25 @@ static inline void perf_fetch_caller_regs(struct pt_regs *regs)
 static __always_inline void
 perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
 {
-	struct pt_regs hot_regs;
+	if (static_key_false(&perf_swevent_enabled[event_id]))
+		__perf_sw_event(event_id, nr, regs, addr);
+}
+
+DECLARE_PER_CPU(struct pt_regs, __perf_regs[4]);
 
+/*
+ * 'Special' version for the scheduler, it hard assumes no recursion,
+ * which is guaranteed by us not actually scheduling inside other swevents
+ * because those disable preemption.
+ */
+static __always_inline void
+perf_sw_event_sched(u32 event_id, u64 nr, u64 addr)
+{
 	if (static_key_false(&perf_swevent_enabled[event_id])) {
-		if (!regs) {
-			perf_fetch_caller_regs(&hot_regs);
-			regs = &hot_regs;
-		}
-		__perf_sw_event(event_id, nr, regs, addr);
+		struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]);
+
+		perf_fetch_caller_regs(regs);
+		___perf_sw_event(event_id, nr, regs, addr);
 	}
 
