[PATCH v2] tracing: Add sched_prepare_exec tracepoint

2024-04-11 Thread Marco Elver
Add "sched_prepare_exec" tracepoint, which is run right after the point
of no return but before the current task assumes its new exec identity.

Unlike the tracepoint "sched_process_exec", the "sched_prepare_exec"
tracepoint runs before flushing the old exec, i.e. while the task still
has the original state (such as original MM), but when the new exec
either succeeds or crashes (but never returns to the original exec).

Being able to trace this event can be helpful in a number of use cases:

  * allowing tracing eBPF programs access to the original MM on exec,
before current->mm is replaced;
  * counting exec in the original task (via perf event);
  * profiling flush time ("sched_prepare_exec" to "sched_process_exec").

Example of tracing output:

 $ cat /sys/kernel/debug/tracing/trace_pipe
<...>-379  [003] .  179.626921: sched_prepare_exec: 
interp=/usr/bin/sshd filename=/usr/bin/sshd pid=379 comm=sshd
<...>-381  [002] .  180.048580: sched_prepare_exec: interp=/bin/bash 
filename=/bin/bash pid=381 comm=sshd
<...>-385  [001] .  180.068277: sched_prepare_exec: interp=/usr/bin/tty 
filename=/usr/bin/tty pid=385 comm=bash
<...>-389  [006] .  192.020147: sched_prepare_exec: 
interp=/usr/bin/dmesg filename=/usr/bin/dmesg pid=389 comm=bash

Signed-off-by: Marco Elver 
---
v2:
* Add more documentation.
* Also show bprm->interp in trace.
* Rename to sched_prepare_exec.
---
 fs/exec.c|  8 
 include/trace/events/sched.h | 35 +++
 2 files changed, 43 insertions(+)

diff --git a/fs/exec.c b/fs/exec.c
index 38bf71cbdf5e..57fee729dd92 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1268,6 +1268,14 @@ int begin_new_exec(struct linux_binprm * bprm)
if (retval)
return retval;
 
+   /*
+* This tracepoint marks the point before flushing the old exec where
+* the current task is still unchanged, but errors are fatal (point of
+* no return). The later "sched_process_exec" tracepoint is called after
+* the current task has successfully switched to the new exec.
+*/
+   trace_sched_prepare_exec(current, bprm);
+
/*
 * Ensure all future errors are fatal.
 */
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index dbb01b4b7451..226f47c6939c 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -420,6 +420,41 @@ TRACE_EVENT(sched_process_exec,
  __entry->pid, __entry->old_pid)
 );
 
+/**
+ * sched_prepare_exec - called before setting up new exec
+ * @task:  pointer to the current task
+ * @bprm:  pointer to linux_binprm used for new exec
+ *
+ * Called before flushing the old exec, where @task is still unchanged, but at
+ * the point of no return during switching to the new exec. At the point it is
+ * called the exec will either succeed, or on failure terminate the task. Also
+ * see the "sched_process_exec" tracepoint, which is called right after @task
+ * has successfully switched to the new exec.
+ */
+TRACE_EVENT(sched_prepare_exec,
+
+   TP_PROTO(struct task_struct *task, struct linux_binprm *bprm),
+
+   TP_ARGS(task, bprm),
+
+   TP_STRUCT__entry(
+   __string(   interp, bprm->interp)
+   __string(   filename,   bprm->filename  )
+   __field(pid_t,  pid )
+   __string(   comm,   task->comm  )
+   ),
+
+   TP_fast_assign(
+   __assign_str(interp, bprm->interp);
+   __assign_str(filename, bprm->filename);
+   __entry->pid = task->pid;
+   __assign_str(comm, task->comm);
+   ),
+
+   TP_printk("interp=%s filename=%s pid=%d comm=%s",
+ __get_str(interp), __get_str(filename),
+ __entry->pid, __get_str(comm))
+);
 
 #ifdef CONFIG_SCHEDSTATS
 #define DEFINE_EVENT_SCHEDSTAT DEFINE_EVENT
-- 
2.44.0.478.gd926399ef9-goog




Re: [PATCH] tracing: Add new_exec tracepoint

2024-04-10 Thread Marco Elver
On Wed, 10 Apr 2024 at 15:56, Masami Hiramatsu  wrote:
>
> On Mon,  8 Apr 2024 11:01:54 +0200
> Marco Elver  wrote:
>
> > Add "new_exec" tracepoint, which is run right after the point of no
> > return but before the current task assumes its new exec identity.
> >
> > Unlike the tracepoint "sched_process_exec", the "new_exec" tracepoint
> > runs before flushing the old exec, i.e. while the task still has the
> > original state (such as original MM), but when the new exec either
> > succeeds or crashes (but never returns to the original exec).
> >
> > Being able to trace this event can be helpful in a number of use cases:
> >
> >   * allowing tracing eBPF programs access to the original MM on exec,
> > before current->mm is replaced;
> >   * counting exec in the original task (via perf event);
> >   * profiling flush time ("new_exec" to "sched_process_exec").
> >
> > Example of tracing output ("new_exec" and "sched_process_exec"):
>
> nit: "new_exec" name a bit stands out compared to other events, and hard to
> expect it comes before or after "sched_process_exec". Since "begin_new_exec"
> is internal implementation name, IMHO, it should not exposed to user.
> What do you think about calling this "sched_prepare_exec" ?

I like it, I'll rename it to sched_prepare_exec.

Thanks!



Re: [PATCH] tracing: Add new_exec tracepoint

2024-04-10 Thread Marco Elver
On Wed, 10 Apr 2024 at 01:54, Masami Hiramatsu  wrote:
>
> On Tue, 9 Apr 2024 16:45:47 +0200
> Marco Elver  wrote:
>
> > On Tue, 9 Apr 2024 at 16:31, Steven Rostedt  wrote:
> > >
> > > On Mon,  8 Apr 2024 11:01:54 +0200
> > > Marco Elver  wrote:
> > >
> > > > Add "new_exec" tracepoint, which is run right after the point of no
> > > > return but before the current task assumes its new exec identity.
> > > >
> > > > Unlike the tracepoint "sched_process_exec", the "new_exec" tracepoint
> > > > runs before flushing the old exec, i.e. while the task still has the
> > > > original state (such as original MM), but when the new exec either
> > > > succeeds or crashes (but never returns to the original exec).
> > > >
> > > > Being able to trace this event can be helpful in a number of use cases:
> > > >
> > > >   * allowing tracing eBPF programs access to the original MM on exec,
> > > > before current->mm is replaced;
> > > >   * counting exec in the original task (via perf event);
> > > >   * profiling flush time ("new_exec" to "sched_process_exec").
> > > >
> > > > Example of tracing output ("new_exec" and "sched_process_exec"):
> > >
> > > How common is this? And can't you just do the same with adding a kprobe?
> >
> > Our main use case would be to use this in BPF programs to become
> > exec-aware, where using the sched_process_exec hook is too late. This
> > is particularly important where the BPF program must stop inspecting
> > the user space's VM when the task does exec to become a new process.
>
> Just out of curiousity, would you like to audit that the user-program
> is not malformed? (security tracepoint?) I think that is an interesting
> idea. What kind of information you need?

I didn't have that in mind. If the BPF program reads (or even writes)
to user space memory, it must stop doing so before current->mm is
switched, otherwise it will lead to random results or memory
corruption. The new process may reallocate the memory that we want to
inspect, but the user space process must explicitly opt in to being
inspected or being manipulated. Just like the kernel "flushes" various
old state on exec since it's becoming a new process, a BPF program
that has per-process state needs to do the same.



Re: [PATCH] tracing: Add new_exec tracepoint

2024-04-09 Thread Marco Elver
On Tue, Apr 09, 2024 at 08:46AM -0700, Kees Cook wrote:
[...]
> > +   trace_new_exec(current, bprm);
> > +
> 
> All other steps in this function have explicit comments about
> what/why/etc. Please add some kind of comment describing why the
> tracepoint is where it is, etc.

I beefed up the tracepoint documentation, and wrote a little paragraph
above where it's called to reinforce what we want.

[...]
> What about binfmt_misc, and binfmt_script? You may want bprm->interp
> too?

Good points. I'll make the below changes for v2:

diff --git a/fs/exec.c b/fs/exec.c
index ab778ae1fc06..472b9f7b40e8 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1268,6 +1268,12 @@ int begin_new_exec(struct linux_binprm * bprm)
if (retval)
return retval;
 
+   /*
+* This tracepoint marks the point before flushing the old exec where
+* the current task is still unchanged, but errors are fatal (point of
+* no return). The later "sched_process_exec" tracepoint is called after
+* the current task has successfully switched to the new exec.
+*/
trace_new_exec(current, bprm);
 
/*
diff --git a/include/trace/events/task.h b/include/trace/events/task.h
index 8853dc44783d..623d9af777c1 100644
--- a/include/trace/events/task.h
+++ b/include/trace/events/task.h
@@ -61,8 +61,11 @@ TRACE_EVENT(task_rename,
  * @task:  pointer to the current task
  * @bprm:  pointer to linux_binprm used for new exec
  *
- * Called before flushing the old exec, but at the point of no return during
- * switching to the new exec.
+ * Called before flushing the old exec, where @task is still unchanged, but at
+ * the point of no return during switching to the new exec. At the point it is
+ * called the exec will either succeed, or on failure terminate the task. Also
+ * see the "sched_process_exec" tracepoint, which is called right after @task
+ * has successfully switched to the new exec.
  */
 TRACE_EVENT(new_exec,
 
@@ -71,19 +74,22 @@ TRACE_EVENT(new_exec,
TP_ARGS(task, bprm),
 
TP_STRUCT__entry(
+   __string(   interp, bprm->interp)
__string(   filename,   bprm->filename  )
__field(pid_t,  pid )
__string(   comm,   task->comm  )
),
 
TP_fast_assign(
+   __assign_str(interp, bprm->interp);
__assign_str(filename, bprm->filename);
__entry->pid = task->pid;
__assign_str(comm, task->comm);
),
 
-   TP_printk("filename=%s pid=%d comm=%s",
- __get_str(filename), __entry->pid, __get_str(comm))
+   TP_printk("interp=%s filename=%s pid=%d comm=%s",
+ __get_str(interp), __get_str(filename),
+ __entry->pid, __get_str(comm))
 );
 
 #endif



Re: [PATCH] tracing: Add new_exec tracepoint

2024-04-09 Thread Marco Elver
On Tue, 9 Apr 2024 at 16:31, Steven Rostedt  wrote:
>
> On Mon,  8 Apr 2024 11:01:54 +0200
> Marco Elver  wrote:
>
> > Add "new_exec" tracepoint, which is run right after the point of no
> > return but before the current task assumes its new exec identity.
> >
> > Unlike the tracepoint "sched_process_exec", the "new_exec" tracepoint
> > runs before flushing the old exec, i.e. while the task still has the
> > original state (such as original MM), but when the new exec either
> > succeeds or crashes (but never returns to the original exec).
> >
> > Being able to trace this event can be helpful in a number of use cases:
> >
> >   * allowing tracing eBPF programs access to the original MM on exec,
> > before current->mm is replaced;
> >   * counting exec in the original task (via perf event);
> >   * profiling flush time ("new_exec" to "sched_process_exec").
> >
> > Example of tracing output ("new_exec" and "sched_process_exec"):
>
> How common is this? And can't you just do the same with adding a kprobe?

Our main use case would be to use this in BPF programs to become
exec-aware, where using the sched_process_exec hook is too late. This
is particularly important where the BPF program must stop inspecting
the user space's VM when the task does exec to become a new process.

kprobe (or BPF's fentry) is brittle here, because begin_new_exec()'s
permission check can still return an error which returns to the
original task without crashing. Only at the point of no return are we
guaranteed that the exec either succeeds, or the task is terminated on
failure.

I don't know if "common" is the right question here, because it's a
chicken-egg problem: no tracepoint, we give up; we have the
tracepoint, it unlocks a range of new use cases (that require robust
solution to make BPF programs exec-aware, and a tracepoint is the only
option IMHO).

Thanks,
-- Marco



[PATCH] tracing: Add new_exec tracepoint

2024-04-08 Thread Marco Elver
Add "new_exec" tracepoint, which is run right after the point of no
return but before the current task assumes its new exec identity.

Unlike the tracepoint "sched_process_exec", the "new_exec" tracepoint
runs before flushing the old exec, i.e. while the task still has the
original state (such as original MM), but when the new exec either
succeeds or crashes (but never returns to the original exec).

Being able to trace this event can be helpful in a number of use cases:

  * allowing tracing eBPF programs access to the original MM on exec,
before current->mm is replaced;
  * counting exec in the original task (via perf event);
  * profiling flush time ("new_exec" to "sched_process_exec").

Example of tracing output ("new_exec" and "sched_process_exec"):

  $ cat /sys/kernel/debug/tracing/trace_pipe
  <...>-379 [003] .   179.626921: new_exec: filename=/usr/bin/sshd 
pid=379 comm=sshd
  <...>-379 [003] .   179.629131: sched_process_exec: 
filename=/usr/bin/sshd pid=379 old_pid=379
  <...>-381 [002] .   180.048580: new_exec: filename=/bin/bash 
pid=381 comm=sshd
  <...>-381 [002] .   180.053122: sched_process_exec: 
filename=/bin/bash pid=381 old_pid=381
  <...>-385 [001] .   180.068277: new_exec: filename=/usr/bin/tty 
pid=385 comm=bash
  <...>-385 [001] .   180.069485: sched_process_exec: 
filename=/usr/bin/tty pid=385 old_pid=385
  <...>-389 [006] .   192.020147: new_exec: filename=/usr/bin/dmesg 
pid=389 comm=bash
   bash-389     [006] .   192.021377: sched_process_exec: 
filename=/usr/bin/dmesg pid=389 old_pid=389

Signed-off-by: Marco Elver 
---
 fs/exec.c   |  2 ++
 include/trace/events/task.h | 30 ++
 2 files changed, 32 insertions(+)

diff --git a/fs/exec.c b/fs/exec.c
index 38bf71cbdf5e..ab778ae1fc06 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1268,6 +1268,8 @@ int begin_new_exec(struct linux_binprm * bprm)
if (retval)
return retval;
 
+   trace_new_exec(current, bprm);
+
/*
 * Ensure all future errors are fatal.
 */
diff --git a/include/trace/events/task.h b/include/trace/events/task.h
index 47b527464d1a..8853dc44783d 100644
--- a/include/trace/events/task.h
+++ b/include/trace/events/task.h
@@ -56,6 +56,36 @@ TRACE_EVENT(task_rename,
__entry->newcomm, __entry->oom_score_adj)
 );
 
+/**
+ * new_exec - called before setting up new exec
+ * @task:  pointer to the current task
+ * @bprm:  pointer to linux_binprm used for new exec
+ *
+ * Called before flushing the old exec, but at the point of no return during
+ * switching to the new exec.
+ */
+TRACE_EVENT(new_exec,
+
+   TP_PROTO(struct task_struct *task, struct linux_binprm *bprm),
+
+   TP_ARGS(task, bprm),
+
+   TP_STRUCT__entry(
+   __string(   filename,   bprm->filename  )
+   __field(pid_t,  pid )
+   __string(   comm,   task->comm  )
+   ),
+
+   TP_fast_assign(
+   __assign_str(filename, bprm->filename);
+   __entry->pid = task->pid;
+   __assign_str(comm, task->comm);
+   ),
+
+   TP_printk("filename=%s pid=%d comm=%s",
+ __get_str(filename), __entry->pid, __get_str(comm))
+);
+
 #endif
 
 /* This part must be outside protection */
-- 
2.44.0.478.gd926399ef9-goog




Re: [PATCH v3 1/3] kasan: switch kunit tests to console tracepoints

2023-12-12 Thread Marco Elver
On Tue, 12 Dec 2023 at 10:19, Paul Heidekrüger  wrote:
>
> On 12.12.2023 00:37, Andrey Konovalov wrote:
> > On Tue, Dec 12, 2023 at 12:35 AM Paul Heidekrüger
> >  wrote:
> > >
> > > Using CONFIG_FTRACE=y instead of CONFIG_TRACEPOINTS=y produces the same 
> > > error
> > > for me.
> > >
> > > So
> > >
> > > CONFIG_KUNIT=y
> > > CONFIG_KUNIT_ALL_TESTS=n
> > > CONFIG_FTRACE=y
> > > CONFIG_KASAN=y
> > > CONFIG_KASAN_GENERIC=y
> > > CONFIG_KASAN_KUNIT_TEST=y
> > >
> > > produces
> > >
> > > ➜   ./tools/testing/kunit/kunit.py run 
> > > --kunitconfig=mm/kasan/.kunitconfig --arch=arm64
> > > Configuring KUnit Kernel ...
> > > Regenerating .config ...
> > > Populating config with:
> > > $ make ARCH=arm64 O=.kunit olddefconfig CC=clang
> > > ERROR:root:Not all Kconfig options selected in kunitconfig were 
> > > in the generated .config.
> > > This is probably due to unsatisfied dependencies.
> > > Missing: CONFIG_KASAN_KUNIT_TEST=y
> > >
> > > By that error message, CONFIG_FTRACE appears to be present in the 
> > > generated
> > > config, but CONFIG_KASAN_KUNIT_TEST still isn't. Presumably,
> > > CONFIG_KASAN_KUNIT_TEST is missing because of an unsatisfied dependency, 
> > > which
> > > must be CONFIG_TRACEPOINTS, unless I'm missing something ...
> > >
> > > If I just generate an arm64 defconfig and select CONFIG_FTRACE=y,
> > > CONFIG_TRACEPOINTS=y shows up in my .config. So, maybe this is 
> > > kunit.py-related
> > > then?
> > >
> > > Andrey, you said that the tests have been working for you; are you 
> > > running them
> > > with kunit.py?
> >
> > No, I just run the kernel built with a config file that I put together
> > based on defconfig.
>
> Ah. I believe I've figured it out.
>
> When I add CONFIG_STACK_TRACER=y in addition to CONFIG_FTRACE=y, it works.

CONFIG_FTRACE should be enough - maybe also check x86 vs. arm64 to debug more.

> CONFIG_STACK_TRACER selects CONFIG_FUNCTION_TRACER, CONFIG_FUNCTION_TRACER
> selects CONFIG_GENERIC_TRACER, CONFIG_GENERIC_TRACER selects CONFIG_TRACING, 
> and
> CONFIG_TRACING selects CONFIG_TRACEPOINTS.
>
> CONFIG_BLK_DEV_IO_TRACE=y also works instead of CONFIG_STACK_TRACER=y, as it
> directly selects CONFIG_TRACEPOINTS.
>
> CONFIG_FTRACE=y on its own does not appear suffice for kunit.py on arm64.

When you build manually with just CONFIG_FTRACE, is CONFIG_TRACEPOINTS enabled?

> I believe the reason my .kunitconfig as well as the existing
> mm/kfence/.kunitconfig work on X86 is because CONFIG_TRACEPOINTS=y is present 
> in
> an X86 defconfig.
>
> Does this make sense?
>
> Would you welcome a patch addressing this for the existing
> mm/kfence/.kunitconfig?
>
> I would also like to submit a patch for an mm/kasan/.kunitconfig. Do you think
> that would be helpful too?
>
> FWICT, kernel/kcsan/.kunitconfig might also be affected since
> CONFIG_KCSAN_KUNIT_TEST also depends on CONFIG_TRACEPOINTS, but I would have 
> to
> test that. That could be a third patch.

I'd support figuring out the minimal config (CONFIG_FTRACE or
something else?) that satisfies the TRACEPOINTS dependency. I always
thought CONFIG_FTRACE ought to be the one config option, but maybe
something changed.

Also maybe one of the tracing maintainers can help untangle what's
going on here.

Thanks,
-- Marco



Re: [PATCH v3 1/3] kasan: switch kunit tests to console tracepoints

2023-12-11 Thread Marco Elver
On Mon, 11 Dec 2023 at 23:48, Paul Heidekrüger  wrote:
>
> On 11.12.2023 21:51, Andrey Konovalov wrote:
> > On Mon, Dec 11, 2023 at 7:59 PM Paul Heidekrüger
> >  wrote:
> > >
> > > > Hi Paul,
> > > >
> > > > I've been successfully running KASAN tests with CONFIG_TRACEPOINTS
> > > > enabled on arm64 since this patch landed.
> > >
> > > Interesting ...
> > >
> > > > What happens when you try running the tests with .kunitconfig? Does
> > > > CONFIG_TRACEPOINTS or CONFIG_KASAN_KUNIT_TEST get disabled during
> > > > kernel building?
> > >
> > > Yes, exactly, that's what's happening.
> > >
> > > Here's the output kunit.py is giving me. I replaced CONFIG_DEBUG_KERNEL 
> > > with
> > > CONFIG_TRACEPOINTS in my .kunitconfig. Otherwise, it's identical with the 
> > > one I
> > > posted above.
> > >
> > > ➜   ./tools/testing/kunit/kunit.py run 
> > > --kunitconfig=mm/kasan/.kunitconfig --arch=arm64
> > > Configuring KUnit Kernel ...
> > > Regenerating .config ...
> > > Populating config with:
> > > $ make ARCH=arm64 O=.kunit olddefconfig
> > > ERROR:root:Not all Kconfig options selected in kunitconfig were 
> > > in the generated .config.
> > > This is probably due to unsatisfied dependencies.
> > > Missing: CONFIG_KASAN_KUNIT_TEST=y, CONFIG_TRACEPOINTS=y
> > >
> > > Does CONFIG_TRACEPOINTS have some dependency I'm not seeing? I couldn't 
> > > find a
> > > reason why it would get disabled, but I could definitely be wrong.
> >
> > Does your .kunitconfig include CONFIG_TRACEPOINTS=y? I don't see it in
> > the listing that you sent earlier.
>
> Yes. For the kunit.py output from my previous email, I replaced
> CONFIG_DEBUG_KERNEL=y with CONFIG_TRACEPOINTS=y. So, the .kunitconfig I used 
> to
> produce the output above was:
>
> CONFIG_KUNIT=y
> CONFIG_KUNIT_ALL_TESTS=n
> CONFIG_TRACEPOINTS=y
> CONFIG_KASAN=y
> CONFIG_KASAN_GENERIC=y
> CONFIG_KASAN_KUNIT_TEST=y
>
> This more or less mirrors what mm/kfence/.kunitconfig is doing, which also 
> isn't
> working on my side; kunit.py reports the same error.

mm/kfence/.kunitconfig does CONFIG_FTRACE=y. TRACEPOINTS is not user
selectable. I don't think any of this has changed since the initial
discussion above, so CONFIG_FTRACE=y is still needed.



Re: [PATCH] slub: Introduce CONFIG_SLUB_RCU_DEBUG

2023-09-11 Thread Marco Elver
On Fri, 25 Aug 2023 at 23:15, 'Jann Horn' via kasan-dev
 wrote:
>
> Currently, KASAN is unable to catch use-after-free in SLAB_TYPESAFE_BY_RCU
> slabs because use-after-free is allowed within the RCU grace period by
> design.
>
> Add a SLUB debugging feature which RCU-delays every individual
> kmem_cache_free() before either actually freeing the object or handing it
> off to KASAN, and change KASAN to poison freed objects as normal when this
> option is enabled.
>
> Note that this creates a 16-byte unpoisoned area in the middle of the
> slab metadata area, which kinda sucks but seems to be necessary in order
> to be able to store an rcu_head in there without triggering an ASAN
> splat during RCU callback processing.
>
> For now I've configured Kconfig.kasan to always enable this feature in the
> GENERIC and SW_TAGS modes; I'm not forcibly enabling it in HW_TAGS mode
> because I'm not sure if it might have unwanted performance degradation
> effects there.
>
> Signed-off-by: Jann Horn 
> ---
> can I get a review from the KASAN folks of this?
> I have been running it on my laptop for a bit and it seems to be working
> fine.
>
> Notes:
> With this patch, a UAF on a TYPESAFE_BY_RCU will splat with an error
> like this (tested by reverting a security bugfix).
> Note that, in the ASAN memory state dump, we can see the little
> unpoisoned 16-byte areas storing the rcu_head.
>
> BUG: KASAN: slab-use-after-free in folio_lock_anon_vma_read+0x129/0x4c0
> Read of size 8 at addr 888004e85b00 by task forkforkfork/592
>
> CPU: 0 PID: 592 Comm: forkforkfork Not tainted 
> 6.5.0-rc7-00105-gae70c1e1f6f5-dirty #334
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
> 1.16.2-debian-1.16.2-1 04/01/2014
> Call Trace:
>  
>  dump_stack_lvl+0x4a/0x80
>  print_report+0xcf/0x660
>  kasan_report+0xd4/0x110
>  folio_lock_anon_vma_read+0x129/0x4c0
>  rmap_walk_anon+0x1cc/0x290
>  folio_referenced+0x277/0x2a0
>  shrink_folio_list+0xb8c/0x1680
>  reclaim_folio_list+0xdc/0x1f0
>  reclaim_pages+0x211/0x280
>  madvise_cold_or_pageout_pte_range+0x812/0xb70
>  walk_pgd_range+0x70b/0xce0
>  __walk_page_range+0x343/0x360
>  walk_page_range+0x227/0x280
>  madvise_pageout+0x1cd/0x2d0
>  do_madvise+0x552/0x15a0
>  __x64_sys_madvise+0x62/0x70
>  do_syscall_64+0x3b/0x90
>  entry_SYSCALL_64_after_hwframe+0x6e/0xd8
> [...]
>  
>
> Allocated by task 574:
>  kasan_save_stack+0x33/0x60
>  kasan_set_track+0x25/0x30
>  __kasan_slab_alloc+0x6e/0x70
>  kmem_cache_alloc+0xfd/0x2b0
>  anon_vma_fork+0x88/0x270
>  dup_mmap+0x87c/0xc10
>  copy_process+0x3399/0x3590
>  kernel_clone+0x10e/0x480
>  __do_sys_clone+0xa1/0xe0
>  do_syscall_64+0x3b/0x90
>  entry_SYSCALL_64_after_hwframe+0x6e/0xd8
>
> Freed by task 0:
>  kasan_save_stack+0x33/0x60
>  kasan_set_track+0x25/0x30
>  kasan_save_free_info+0x2b/0x50
>  __kasan_slab_free+0xfe/0x180
>  slab_free_after_rcu_debug+0xad/0x200
>  rcu_core+0x638/0x1620
>  __do_softirq+0x14c/0x581
>
> Last potentially related work creation:
>  kasan_save_stack+0x33/0x60
>  __kasan_record_aux_stack+0x94/0xa0
>  __call_rcu_common.constprop.0+0x47/0x730
>  __put_anon_vma+0x6e/0x150
>  unlink_anon_vmas+0x277/0x2e0
>  vma_complete+0x341/0x580
>  vma_merge+0x613/0xff0
>  mprotect_fixup+0x1c0/0x510
>  do_mprotect_pkey+0x5a7/0x710
>  __x64_sys_mprotect+0x47/0x60
>  do_syscall_64+0x3b/0x90
>  entry_SYSCALL_64_after_hwframe+0x6e/0xd8
>
> Second to last potentially related work creation:
> [...]
>
> The buggy address belongs to the object at 888004e85b00
>  which belongs to the cache anon_vma of size 192
> The buggy address is located 0 bytes inside of
>  freed 192-byte region [888004e85b00, 888004e85bc0)
>
> The buggy address belongs to the physical page:
> [...]
>
> Memory state around the buggy address:
>  888004e85a00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>  888004e85a80: 00 00 00 00 00 00 00 00 fc 00 00 fc fc fc fc fc
> >888004e85b00: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>^
>  888004e85b80: fb fb fb fb fb fb fb fb fc 00 00 fc fc fc fc fc
>  888004e85c00: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>
>  include/linux/kasan.h|  6 
>  include/linux/slub_def.h |  3 ++
>  lib/Kconfig.kasan|  2 ++
>  mm/Kconfig.debug | 21 +
>  mm/kasan/common.c| 15 -
>  mm/slub.c| 66 +---

Nice!

It'd be good to add a test case to lib/test_kasan module. I think you
could just copy/adjust the test case "test_memcache_typesafe_by_rcu"
from the KFENCE KUnit test suite.

>  6 files changed, 107 insertions(+), 6 deletions(-)
>
> diff --git a/include/linux/kasan.h 

Re: [PATCH v4 05/10] signal: Introduce TRAP_PERF si_code and si_perf to siginfo

2021-04-20 Thread Marco Elver
On Tue, 20 Apr 2021 at 23:26, Marek Szyprowski  wrote:
>
> Hi Marco,
>
> On 08.04.2021 12:36, Marco Elver wrote:
> > Introduces the TRAP_PERF si_code, and associated siginfo_t field
> > si_perf. These will be used by the perf event subsystem to send signals
> > (if requested) to the task where an event occurred.
> >
> > Acked-by: Geert Uytterhoeven  # m68k
> > Acked-by: Arnd Bergmann  # asm-generic
> > Signed-off-by: Marco Elver 
>
> This patch landed in linux-next as commit fb6cc127e0b6 ("signal:
> Introduce TRAP_PERF si_code and si_perf to siginfo"). It causes
> regression on my test systems (arm 32bit and 64bit). Most systems fails
> to boot in the given time frame. I've observed that there is a timeout
> waiting for udev to populate /dev and then also during the network
> interfaces configuration. Reverting this commit, together with
> 97ba62b27867 ("perf: Add support for SIGTRAP on perf events") to let it
> compile, on top of next-20210420 fixes the issue.

Thanks, this is weird for sure and nothing in particular stands out.

I have questions:
-- Can you please share your config?
-- Also, can you share how you run this? Can it be reproduced in qemu?
-- How did you derive this patch to be at fault? Why not just
97ba62b27867, given you also need to revert it?

If you are unsure which patch exactly it is, can you try just
reverting 97ba62b27867 and see what happens?

Thanks,
-- Marco

> > ---
> >   arch/m68k/kernel/signal.c  |  3 +++
> >   arch/x86/kernel/signal_compat.c|  5 -
> >   fs/signalfd.c  |  4 
> >   include/linux/compat.h |  2 ++
> >   include/linux/signal.h |  1 +
> >   include/uapi/asm-generic/siginfo.h |  6 +-
> >   include/uapi/linux/signalfd.h  |  4 +++-
> >   kernel/signal.c| 11 +++
> >   8 files changed, 33 insertions(+), 3 deletions(-)
> >
> > diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c
> > index 349570f16a78..a4b7ee1df211 100644
> > --- a/arch/m68k/kernel/signal.c
> > +++ b/arch/m68k/kernel/signal.c
> > @@ -622,6 +622,9 @@ static inline void siginfo_build_tests(void)
> >   /* _sigfault._addr_pkey */
> >   BUILD_BUG_ON(offsetof(siginfo_t, si_pkey) != 0x12);
> >
> > + /* _sigfault._perf */
> > + BUILD_BUG_ON(offsetof(siginfo_t, si_perf) != 0x10);
> > +
> >   /* _sigpoll */
> >   BUILD_BUG_ON(offsetof(siginfo_t, si_band)   != 0x0c);
> >   BUILD_BUG_ON(offsetof(siginfo_t, si_fd) != 0x10);
> > diff --git a/arch/x86/kernel/signal_compat.c 
> > b/arch/x86/kernel/signal_compat.c
> > index a5330ff498f0..0e5d0a7e203b 100644
> > --- a/arch/x86/kernel/signal_compat.c
> > +++ b/arch/x86/kernel/signal_compat.c
> > @@ -29,7 +29,7 @@ static inline void signal_compat_build_tests(void)
> >   BUILD_BUG_ON(NSIGFPE  != 15);
> >   BUILD_BUG_ON(NSIGSEGV != 9);
> >   BUILD_BUG_ON(NSIGBUS  != 5);
> > - BUILD_BUG_ON(NSIGTRAP != 5);
> > + BUILD_BUG_ON(NSIGTRAP != 6);
> >   BUILD_BUG_ON(NSIGCHLD != 6);
> >   BUILD_BUG_ON(NSIGSYS  != 2);
> >
> > @@ -138,6 +138,9 @@ static inline void signal_compat_build_tests(void)
> >   BUILD_BUG_ON(offsetof(siginfo_t, si_pkey) != 0x20);
> >   BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pkey) != 0x14);
> >
> > + BUILD_BUG_ON(offsetof(siginfo_t, si_perf) != 0x18);
> > + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf) != 0x10);
> > +
> >   CHECK_CSI_OFFSET(_sigpoll);
> >   CHECK_CSI_SIZE  (_sigpoll, 2*sizeof(int));
> >   CHECK_SI_SIZE   (_sigpoll, 4*sizeof(int));
> > diff --git a/fs/signalfd.c b/fs/signalfd.c
> > index 456046e15873..040a1142915f 100644
> > --- a/fs/signalfd.c
> > +++ b/fs/signalfd.c
> > @@ -134,6 +134,10 @@ static int signalfd_copyinfo(struct signalfd_siginfo 
> > __user *uinfo,
> >   #endif
> >   new.ssi_addr_lsb = (short) kinfo->si_addr_lsb;
> >   break;
> > + case SIL_PERF_EVENT:
> > + new.ssi_addr = (long) kinfo->si_addr;
> > + new.ssi_perf = kinfo->si_perf;
> > + break;
> >   case SIL_CHLD:
> >   new.ssi_pid= kinfo->si_pid;
> >   new.ssi_uid= kinfo->si_uid;
> > diff --git a/include/linux/compat.h b/include/linux/compat.h
> > index 6e65be753603..c8821d966812 100644
> > --- a/include/linux/compat.h
> > +++ b/include/linux/compat.h
> > @@ -236,6 +236,8 @@ typedef struct compat_siginfo {
> >  

Re: [PATCH 1/3] kfence: await for allocation using wait_event

2021-04-19 Thread Marco Elver
On Mon, 19 Apr 2021 at 11:44, Marco Elver  wrote:
>
> On Mon, 19 Apr 2021 at 11:41, Hillf Danton  wrote:
> >
> > On Mon, 19 Apr 2021 10:50:25 Marco Elver wrote:
> > > +
> > > + WRITE_ONCE(kfence_timer_waiting, true);
> > > + smp_mb(); /* See comment in __kfence_alloc(). */
> >
> > This is not needed given task state change in wait_event().
>
> Yes it is. We want to avoid the unconditional irq_work in
> __kfence_alloc(). When the system is under load doing frequent
> allocations, at least in my tests this avoids the irq_work almost
> always. Without the irq_work you'd be correct of course.

And in case this is about the smp_mb() here, yes it definitely is
required. We *must* order the write of kfence_timer_waiting *before*
the check of kfence_allocation_gate, which wait_event() does before
anything else (including changing the state). Otherwise the write may
be reordered after the read, and we could potentially never wake up
because __kfence_alloc() not waking us.

This is documented in __kfence_alloc().

> > > + wait_event_timeout(allocation_wait, 
> > > atomic_read(&kfence_allocation_gate), HZ);
> > > + smp_store_release(&kfence_timer_waiting, false); /* Order after 
> > > wait_event(). */
> > > +


Re: [PATCH 1/3] kfence: await for allocation using wait_event

2021-04-19 Thread Marco Elver
On Mon, 19 Apr 2021 at 11:41, Hillf Danton  wrote:
>
> On Mon, 19 Apr 2021 10:50:25 Marco Elver wrote:
> > +
> > + WRITE_ONCE(kfence_timer_waiting, true);
> > + smp_mb(); /* See comment in __kfence_alloc(). */
>
> This is not needed given task state change in wait_event().

Yes it is. We want to avoid the unconditional irq_work in
__kfence_alloc(). When the system is under load doing frequent
allocations, at least in my tests this avoids the irq_work almost
always. Without the irq_work you'd be correct of course.

> > + wait_event_timeout(allocation_wait, 
> > atomic_read(&kfence_allocation_gate), HZ);
> > + smp_store_release(&kfence_timer_waiting, false); /* Order after 
> > wait_event(). */
> > +


[PATCH 3/3] kfence: use power-efficient work queue to run delayed work

2021-04-19 Thread Marco Elver
Use the power-efficient work queue, to avoid the pathological case where
we keep pinning ourselves on the same possibly idle CPU on systems that
want to be power-efficient [1].
[1] https://lwn.net/Articles/731052/

Signed-off-by: Marco Elver 
---
 mm/kfence/core.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 73e7b621fb36..7e20cd9690a2 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -642,7 +642,8 @@ static void toggle_allocation_gate(struct work_struct *work)
/* Disable static key and reset timer. */
static_branch_disable(_allocation_key);
 #endif
-   schedule_delayed_work(_timer, 
msecs_to_jiffies(kfence_sample_interval));
+   queue_delayed_work(system_power_efficient_wq, _timer,
+  msecs_to_jiffies(kfence_sample_interval));
 }
 static DECLARE_DELAYED_WORK(kfence_timer, toggle_allocation_gate);
 
@@ -671,7 +672,7 @@ void __init kfence_init(void)
}
 
WRITE_ONCE(kfence_enabled, true);
-   schedule_delayed_work(_timer, 0);
+   queue_delayed_work(system_power_efficient_wq, _timer, 0);
pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", 
KFENCE_POOL_SIZE,
CONFIG_KFENCE_NUM_OBJECTS, (void *)__kfence_pool,
(void *)(__kfence_pool + KFENCE_POOL_SIZE));
-- 
2.31.1.368.gbe11c130af-goog



[PATCH 2/3] kfence: maximize allocation wait timeout duration

2021-04-19 Thread Marco Elver
The allocation wait timeout was initially added because of warnings due
to CONFIG_DETECT_HUNG_TASK=y [1]. While the 1 sec timeout is sufficient
to resolve the warnings (given the hung task timeout must be 1 sec or
larger), it may cause unnecessary wake-ups if the system is idle.
[1] 
https://lkml.kernel.org/r/CADYN=9j0dqhizagb0-jz4hobbh+05kmbxb4c0cxms7qi5na...@mail.gmail.com

Fix it by computing the timeout duration in terms of the current
sysctl_hung_task_timeout_secs value.

Signed-off-by: Marco Elver 
---
 mm/kfence/core.c | 12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 5f0a56041549..73e7b621fb36 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -626,7 +627,16 @@ static void toggle_allocation_gate(struct work_struct 
*work)
 
WRITE_ONCE(kfence_timer_waiting, true);
smp_mb(); /* See comment in __kfence_alloc(). */
-   wait_event_timeout(allocation_wait, 
atomic_read(_allocation_gate), HZ);
+   if (sysctl_hung_task_timeout_secs) {
+   /*
+* During low activity with no allocations we might wait a
+* while; let's avoid the hung task warning.
+*/
+   wait_event_timeout(allocation_wait, 
atomic_read(_allocation_gate),
+  sysctl_hung_task_timeout_secs * HZ / 2);
+   } else {
+   wait_event(allocation_wait, 
atomic_read(_allocation_gate));
+   }
smp_store_release(_timer_waiting, false); /* Order after 
wait_event(). */
 
/* Disable static key and reset timer. */
-- 
2.31.1.368.gbe11c130af-goog



[PATCH 1/3] kfence: await for allocation using wait_event

2021-04-19 Thread Marco Elver
On mostly-idle systems, we have observed that toggle_allocation_gate()
is a cause of frequent wake-ups, preventing an otherwise idle CPU from
going into a lower power state.

A late change in KFENCE's development, due to a potential deadlock [1],
required changing the scheduling-friendly wait_event_timeout() and
wake_up() to an open-coded wait-loop using schedule_timeout().
[1] https://lkml.kernel.org/r/c0645805b7f98...@google.com

To avoid unnecessary wake-ups, switch to using wait_event_timeout().

Unfortunately, we still cannot use a version with direct wake_up() in
__kfence_alloc() due to the same potential for deadlock as in [1].
Instead, add a level of indirection via an irq_work that is scheduled if
we determine that the kfence_timer requires a wake_up().

Fixes: 0ce20dd84089 ("mm: add Kernel Electric-Fence infrastructure")
Signed-off-by: Marco Elver 
---
 lib/Kconfig.kfence |  1 +
 mm/kfence/core.c   | 58 +-
 2 files changed, 43 insertions(+), 16 deletions(-)

diff --git a/lib/Kconfig.kfence b/lib/Kconfig.kfence
index 78f50ccb3b45..e641add33947 100644
--- a/lib/Kconfig.kfence
+++ b/lib/Kconfig.kfence
@@ -7,6 +7,7 @@ menuconfig KFENCE
bool "KFENCE: low-overhead sampling-based memory safety error detector"
depends on HAVE_ARCH_KFENCE && (SLAB || SLUB)
select STACKTRACE
+   select IRQ_WORK
help
  KFENCE is a low-overhead sampling-based detector of heap out-of-bounds
  access, use-after-free, and invalid-free errors. KFENCE is designed
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 768dbd58170d..5f0a56041549 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -10,6 +10,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -587,6 +588,20 @@ late_initcall(kfence_debugfs_init);
 
 /* === Allocation Gate Timer  
*/
 
+#ifdef CONFIG_KFENCE_STATIC_KEYS
+/* Wait queue to wake up allocation-gate timer task. */
+static DECLARE_WAIT_QUEUE_HEAD(allocation_wait);
+
+static void wake_up_kfence_timer(struct irq_work *work)
+{
+   wake_up(_wait);
+}
+static DEFINE_IRQ_WORK(wake_up_kfence_timer_work, wake_up_kfence_timer);
+
+/* Indicate if timer task is waiting, to avoid unnecessary irq_work. */
+static bool kfence_timer_waiting;
+#endif
+
 /*
  * Set up delayed work, which will enable and disable the static key. We need 
to
  * use a work queue (rather than a simple timer), since enabling and disabling 
a
@@ -604,25 +619,16 @@ static void toggle_allocation_gate(struct work_struct 
*work)
if (!READ_ONCE(kfence_enabled))
return;
 
-   /* Enable static key, and await allocation to happen. */
atomic_set(_allocation_gate, 0);
 #ifdef CONFIG_KFENCE_STATIC_KEYS
+   /* Enable static key, and await allocation to happen. */
static_branch_enable(_allocation_key);
-   /*
-* Await an allocation. Timeout after 1 second, in case the kernel stops
-* doing allocations, to avoid stalling this worker task for too long.
-*/
-   {
-   unsigned long end_wait = jiffies + HZ;
-
-   do {
-   set_current_state(TASK_UNINTERRUPTIBLE);
-   if (atomic_read(_allocation_gate) != 0)
-   break;
-   schedule_timeout(1);
-   } while (time_before(jiffies, end_wait));
-   __set_current_state(TASK_RUNNING);
-   }
+
+   WRITE_ONCE(kfence_timer_waiting, true);
+   smp_mb(); /* See comment in __kfence_alloc(). */
+   wait_event_timeout(allocation_wait, 
atomic_read(_allocation_gate), HZ);
+   smp_store_release(_timer_waiting, false); /* Order after 
wait_event(). */
+
/* Disable static key and reset timer. */
static_branch_disable(_allocation_key);
 #endif
@@ -729,6 +735,26 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, 
gfp_t flags)
 */
if (atomic_read(_allocation_gate) || 
atomic_inc_return(_allocation_gate) > 1)
return NULL;
+#ifdef CONFIG_KFENCE_STATIC_KEYS
+   /*
+* Read of kfence_timer_waiting must be ordered after write to
+* kfence_allocation_gate (fully ordered per atomic_inc_return()).
+*
+* Conversely, the write to kfence_timer_waiting must be ordered before
+* the check of kfence_allocation_gate in toggle_allocation_gate().
+*
+* This ensures that toggle_allocation_gate() always sees the updated
+* kfence_allocation_gate, or we see that the timer is waiting and will
+* queue the work to wake it up.
+*/
+   if (READ_ONCE(kfence_timer_waiting)) {
+   /*
+* Calling wake_up() here may deadlock when allocations happen
+* from within timer code. Use an irq_work to defer it.
+*/

[PATCH 0/3] kfence: optimize timer scheduling

2021-04-19 Thread Marco Elver
We have observed that mostly-idle systems with KFENCE enabled wake up
otherwise idle CPUs, preventing them from entering a lower power state.
Debugging revealed that KFENCE spends too much active time in
toggle_allocation_gate().

While the first version of KFENCE was using all the right bits to be
scheduling optimal, and thus power efficient, by simply using
wait_event() + wake_up(), that code was unfortunately removed.

As KFENCE was exposed to various different configs and tests, the
scheduling optimal code slowly disappeared. First because of hung task
warnings, and finally because of deadlocks when an allocation is made by
timer code with debug objects enabled. Clearly, the "fixes" were not too
friendly for devices that want to be power efficient.

Therefore, let's try a little harder to fix the hung task and deadlock
problems that we have with wait_event() + wake_up(), while remaining as
scheduling friendly and power efficient as possible.

Crucially, we need to defer the wake_up() to an irq_work, avoiding any
potential for deadlock.

The result with this series is that on the devices where we observed a
power regression, power usage returns back to baseline levels.

Marco Elver (3):
  kfence: await for allocation using wait_event
  kfence: maximize allocation wait timeout duration
  kfence: use power-efficient work queue to run delayed work

 lib/Kconfig.kfence |  1 +
 mm/kfence/core.c   | 71 +++---
 2 files changed, 55 insertions(+), 17 deletions(-)

-- 
2.31.1.368.gbe11c130af-goog



[tip: perf/core] perf: Apply PERF_EVENT_IOC_MODIFY_ATTRIBUTES to children

2021-04-16 Thread tip-bot2 for Marco Elver
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 47f661eca0700928012e11c57ea0328f5ccfc3b9
Gitweb:
https://git.kernel.org/tip/47f661eca0700928012e11c57ea0328f5ccfc3b9
Author:Marco Elver 
AuthorDate:Thu, 08 Apr 2021 12:35:57 +02:00
Committer: Peter Zijlstra 
CommitterDate: Fri, 16 Apr 2021 16:32:40 +02:00

perf: Apply PERF_EVENT_IOC_MODIFY_ATTRIBUTES to children

As with other ioctls (such as PERF_EVENT_IOC_{ENABLE,DISABLE}), fix up
handling of PERF_EVENT_IOC_MODIFY_ATTRIBUTES to also apply to children.

Suggested-by: Dmitry Vyukov 
Signed-off-by: Marco Elver 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Dmitry Vyukov 
Link: https://lkml.kernel.org/r/20210408103605.1676875-3-el...@google.com
---
 kernel/events/core.c | 22 +-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 318ff7b..10ed2cd 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3200,16 +3200,36 @@ static int perf_event_modify_breakpoint(struct 
perf_event *bp,
 static int perf_event_modify_attr(struct perf_event *event,
  struct perf_event_attr *attr)
 {
+   int (*func)(struct perf_event *, struct perf_event_attr *);
+   struct perf_event *child;
+   int err;
+
if (event->attr.type != attr->type)
return -EINVAL;
 
switch (event->attr.type) {
case PERF_TYPE_BREAKPOINT:
-   return perf_event_modify_breakpoint(event, attr);
+   func = perf_event_modify_breakpoint;
+   break;
default:
/* Place holder for future additions. */
return -EOPNOTSUPP;
}
+
+   WARN_ON_ONCE(event->ctx->parent_ctx);
+
+   mutex_lock(>child_mutex);
+   err = func(event, attr);
+   if (err)
+   goto out;
+   list_for_each_entry(child, >child_list, child_list) {
+   err = func(child, attr);
+   if (err)
+   goto out;
+   }
+out:
+   mutex_unlock(>child_mutex);
+   return err;
 }
 
 static void ctx_sched_out(struct perf_event_context *ctx,


[tip: perf/core] perf: Support only inheriting events if cloned with CLONE_THREAD

2021-04-16 Thread tip-bot2 for Marco Elver
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 2b26f0aa004995f49f7b6f4100dd0e4c39a9ed5f
Gitweb:
https://git.kernel.org/tip/2b26f0aa004995f49f7b6f4100dd0e4c39a9ed5f
Author:Marco Elver 
AuthorDate:Thu, 08 Apr 2021 12:35:58 +02:00
Committer: Peter Zijlstra 
CommitterDate: Fri, 16 Apr 2021 16:32:40 +02:00

perf: Support only inheriting events if cloned with CLONE_THREAD

Adds bit perf_event_attr::inherit_thread, to restrict inheriting
events only if the child was cloned with CLONE_THREAD.

This option supports the case where an event is supposed to be
process-wide only (including subthreads), but should not propagate
beyond the current process's shared environment.

Suggested-by: Peter Zijlstra 
Signed-off-by: Marco Elver 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lore.kernel.org/lkml/ybvj6ejr%2fdy2t...@hirez.programming.kicks-ass.net/
---
 include/linux/perf_event.h  |  5 +++--
 include/uapi/linux/perf_event.h |  3 ++-
 kernel/events/core.c| 21 ++---
 kernel/fork.c   |  2 +-
 4 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 3d478ab..1660039 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -958,7 +958,7 @@ extern void __perf_event_task_sched_in(struct task_struct 
*prev,
   struct task_struct *task);
 extern void __perf_event_task_sched_out(struct task_struct *prev,
struct task_struct *next);
-extern int perf_event_init_task(struct task_struct *child);
+extern int perf_event_init_task(struct task_struct *child, u64 clone_flags);
 extern void perf_event_exit_task(struct task_struct *child);
 extern void perf_event_free_task(struct task_struct *task);
 extern void perf_event_delayed_put(struct task_struct *task);
@@ -1449,7 +1449,8 @@ perf_event_task_sched_in(struct task_struct *prev,
 static inline void
 perf_event_task_sched_out(struct task_struct *prev,
  struct task_struct *next) { }
-static inline int perf_event_init_task(struct task_struct *child)  { 
return 0; }
+static inline int perf_event_init_task(struct task_struct *child,
+  u64 clone_flags) { 
return 0; }
 static inline void perf_event_exit_task(struct task_struct *child) { }
 static inline void perf_event_free_task(struct task_struct *task)  { }
 static inline void perf_event_delayed_put(struct task_struct *task){ }
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index ad15e40..813efb6 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -389,7 +389,8 @@ struct perf_event_attr {
cgroup :  1, /* include cgroup events */
text_poke  :  1, /* include text poke 
events */
build_id   :  1, /* use build id in mmap2 
events */
-   __reserved_1   : 29;
+   inherit_thread :  1, /* children only inherit 
if cloned with CLONE_THREAD */
+   __reserved_1   : 28;
 
union {
__u32   wakeup_events;/* wakeup every n events */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 10ed2cd..3e3c00f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11653,6 +11653,9 @@ static int perf_copy_attr(struct perf_event_attr __user 
*uattr,
(attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT))
return -EINVAL;
 
+   if (!attr->inherit && attr->inherit_thread)
+   return -EINVAL;
+
 out:
return ret;
 
@@ -12873,12 +12876,13 @@ static int
 inherit_task_group(struct perf_event *event, struct task_struct *parent,
   struct perf_event_context *parent_ctx,
   struct task_struct *child, int ctxn,
-  int *inherited_all)
+  u64 clone_flags, int *inherited_all)
 {
int ret;
struct perf_event_context *child_ctx;
 
-   if (!event->attr.inherit) {
+   if (!event->attr.inherit ||
+   (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD))) {
*inherited_all = 0;
return 0;
}
@@ -12910,7 +12914,8 @@ inherit_task_group(struct perf_event *event, struct 
task_struct *parent,
 /*
  * Initialize the perf_event context in task_struct
  */
-static int perf_event_init_context(struct task_struct *child, int ctxn)
+static int perf_event_init_context(struct task_struct *child, int ctxn,
+  u64 clone_flags)
 {
struct perf_event_context *child_ctx, *parent_ctx;
struct perf_event_context *clon

[tip: perf/core] signal: Introduce TRAP_PERF si_code and si_perf to siginfo

2021-04-16 Thread tip-bot2 for Marco Elver
The following commit has been merged into the perf/core branch of tip:

Commit-ID: fb6cc127e0b6e629252cdd0f77d5a1f49db95b92
Gitweb:
https://git.kernel.org/tip/fb6cc127e0b6e629252cdd0f77d5a1f49db95b92
Author:Marco Elver 
AuthorDate:Thu, 08 Apr 2021 12:36:00 +02:00
Committer: Peter Zijlstra 
CommitterDate: Fri, 16 Apr 2021 16:32:41 +02:00

signal: Introduce TRAP_PERF si_code and si_perf to siginfo

Introduces the TRAP_PERF si_code, and associated siginfo_t field
si_perf. These will be used by the perf event subsystem to send signals
(if requested) to the task where an event occurred.

Signed-off-by: Marco Elver 
Signed-off-by: Peter Zijlstra (Intel) 
Acked-by: Geert Uytterhoeven  # m68k
Acked-by: Arnd Bergmann  # asm-generic
Link: https://lkml.kernel.org/r/20210408103605.1676875-6-el...@google.com
---
 arch/m68k/kernel/signal.c  |  3 +++
 arch/x86/kernel/signal_compat.c|  5 -
 fs/signalfd.c  |  4 
 include/linux/compat.h |  2 ++
 include/linux/signal.h |  1 +
 include/uapi/asm-generic/siginfo.h |  6 +-
 include/uapi/linux/signalfd.h  |  4 +++-
 kernel/signal.c| 11 +++
 8 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c
index 349570f..a4b7ee1 100644
--- a/arch/m68k/kernel/signal.c
+++ b/arch/m68k/kernel/signal.c
@@ -622,6 +622,9 @@ static inline void siginfo_build_tests(void)
/* _sigfault._addr_pkey */
BUILD_BUG_ON(offsetof(siginfo_t, si_pkey) != 0x12);
 
+   /* _sigfault._perf */
+   BUILD_BUG_ON(offsetof(siginfo_t, si_perf) != 0x10);
+
/* _sigpoll */
BUILD_BUG_ON(offsetof(siginfo_t, si_band)   != 0x0c);
BUILD_BUG_ON(offsetof(siginfo_t, si_fd) != 0x10);
diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
index a5330ff..0e5d0a7 100644
--- a/arch/x86/kernel/signal_compat.c
+++ b/arch/x86/kernel/signal_compat.c
@@ -29,7 +29,7 @@ static inline void signal_compat_build_tests(void)
BUILD_BUG_ON(NSIGFPE  != 15);
BUILD_BUG_ON(NSIGSEGV != 9);
BUILD_BUG_ON(NSIGBUS  != 5);
-   BUILD_BUG_ON(NSIGTRAP != 5);
+   BUILD_BUG_ON(NSIGTRAP != 6);
BUILD_BUG_ON(NSIGCHLD != 6);
BUILD_BUG_ON(NSIGSYS  != 2);
 
@@ -138,6 +138,9 @@ static inline void signal_compat_build_tests(void)
BUILD_BUG_ON(offsetof(siginfo_t, si_pkey) != 0x20);
BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pkey) != 0x14);
 
+   BUILD_BUG_ON(offsetof(siginfo_t, si_perf) != 0x18);
+   BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf) != 0x10);
+
CHECK_CSI_OFFSET(_sigpoll);
CHECK_CSI_SIZE  (_sigpoll, 2*sizeof(int));
CHECK_SI_SIZE   (_sigpoll, 4*sizeof(int));
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 456046e..040a114 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -134,6 +134,10 @@ static int signalfd_copyinfo(struct signalfd_siginfo 
__user *uinfo,
 #endif
new.ssi_addr_lsb = (short) kinfo->si_addr_lsb;
break;
+   case SIL_PERF_EVENT:
+   new.ssi_addr = (long) kinfo->si_addr;
+   new.ssi_perf = kinfo->si_perf;
+   break;
case SIL_CHLD:
new.ssi_pid= kinfo->si_pid;
new.ssi_uid= kinfo->si_uid;
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 6e65be7..c8821d9 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -236,6 +236,8 @@ typedef struct compat_siginfo {
char 
_dummy_pkey[__COMPAT_ADDR_BND_PKEY_PAD];
u32 _pkey;
} _addr_pkey;
+   /* used when si_code=TRAP_PERF */
+   compat_u64 _perf;
};
} _sigfault;
 
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 205526c..1e98548 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -43,6 +43,7 @@ enum siginfo_layout {
SIL_FAULT_MCEERR,
SIL_FAULT_BNDERR,
SIL_FAULT_PKUERR,
+   SIL_PERF_EVENT,
SIL_CHLD,
SIL_RT,
SIL_SYS,
diff --git a/include/uapi/asm-generic/siginfo.h 
b/include/uapi/asm-generic/siginfo.h
index d259700..d0bb912 100644
--- a/include/uapi/asm-generic/siginfo.h
+++ b/include/uapi/asm-generic/siginfo.h
@@ -91,6 +91,8 @@ union __sifields {
char _dummy_pkey[__ADDR_BND_PKEY_PAD];
__u32 _pkey;
} _addr_pkey;
+   /* used when si_code=TRAP_PERF */
+   __u64 _perf;
};
} _sigfault;
 
@@ -155,6 +157,7 @@ typedef struct siginfo {
 #define si_lower   _sifields._sigfault._addr_bnd._lower
 #define si_upper   _sifields._sigfault._addr_bnd._upper
 

[tip: perf/core] perf: Add support for SIGTRAP on perf events

2021-04-16 Thread tip-bot2 for Marco Elver
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 97ba62b278674293762c3d91f724f1bb922f04e0
Gitweb:
https://git.kernel.org/tip/97ba62b278674293762c3d91f724f1bb922f04e0
Author:Marco Elver 
AuthorDate:Thu, 08 Apr 2021 12:36:01 +02:00
Committer: Peter Zijlstra 
CommitterDate: Fri, 16 Apr 2021 16:32:41 +02:00

perf: Add support for SIGTRAP on perf events

Adds bit perf_event_attr::sigtrap, which can be set to cause events to
send SIGTRAP (with si_code TRAP_PERF) to the task where the event
occurred. The primary motivation is to support synchronous signals on
perf events in the task where an event (such as breakpoints) triggered.

To distinguish perf events based on the event type, the type is set in
si_errno. For events that are associated with an address, si_addr is
copied from perf_sample_data.

The new field perf_event_attr::sig_data is copied to si_perf, which
allows user space to disambiguate which event (of the same type)
triggered the signal. For example, user space could encode the relevant
information it cares about in sig_data.

We note that the choice of an opaque u64 provides the simplest and most
flexible option. Alternatives where a reference to some user space data
is passed back suffer from the problem that modification of referenced
data (be it the event fd, or the perf_event_attr) can race with the
signal being delivered (of course, the same caveat applies if user space
decides to store a pointer in sig_data, but the ABI explicitly avoids
prescribing such a design).

Suggested-by: Peter Zijlstra 
Signed-off-by: Marco Elver 
Signed-off-by: Peter Zijlstra (Intel) 
Acked-by: Dmitry Vyukov 
Link: 
https://lore.kernel.org/lkml/ybv3rat566k+6...@hirez.programming.kicks-ass.net/
---
 include/linux/perf_event.h  |  1 +-
 include/uapi/linux/perf_event.h | 10 ++-
 kernel/events/core.c| 49 +++-
 3 files changed, 58 insertions(+), 2 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 1660039..7d7280a 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -735,6 +735,7 @@ struct perf_event {
int pending_wakeup;
int pending_kill;
int pending_disable;
+   unsigned long   pending_addr;   /* SIGTRAP */
struct irq_work pending;
 
atomic_tevent_limit;
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 8c5b9f5..31b00e3 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -311,6 +311,7 @@ enum perf_event_read_format {
 #define PERF_ATTR_SIZE_VER4104 /* add: sample_regs_intr */
 #define PERF_ATTR_SIZE_VER5112 /* add: aux_watermark */
 #define PERF_ATTR_SIZE_VER6120 /* add: aux_sample_size */
+#define PERF_ATTR_SIZE_VER7128 /* add: sig_data */
 
 /*
  * Hardware event_id to monitor via a performance monitoring event:
@@ -391,7 +392,8 @@ struct perf_event_attr {
build_id   :  1, /* use build id in mmap2 
events */
inherit_thread :  1, /* children only inherit 
if cloned with CLONE_THREAD */
remove_on_exec :  1, /* event is removed from 
task on exec */
-   __reserved_1   : 27;
+   sigtrap:  1, /* send synchronous 
SIGTRAP on event */
+   __reserved_1   : 26;
 
union {
__u32   wakeup_events;/* wakeup every n events */
@@ -443,6 +445,12 @@ struct perf_event_attr {
__u16   __reserved_2;
__u32   aux_sample_size;
__u32   __reserved_3;
+
+   /*
+* User provided data if sigtrap=1, passed back to user via
+* siginfo_t::si_perf, e.g. to permit user to identify the event.
+*/
+   __u64   sig_data;
 };
 
 /*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e4a584b..6f0723c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6392,6 +6392,33 @@ void perf_event_wakeup(struct perf_event *event)
}
 }
 
+static void perf_sigtrap(struct perf_event *event)
+{
+   struct kernel_siginfo info;
+
+   /*
+* We'd expect this to only occur if the irq_work is delayed and either
+* ctx->task or current has changed in the meantime. This can be the
+* case on architectures that do not implement arch_irq_work_raise().
+*/
+   if (WARN_ON_ONCE(event->ctx->task != current))
+   return;
+
+   /*
+* perf_pending_event() can race with the task exiting.
+*/
+   if (current->flags & PF_EXITING)
+   return;
+
+   clear_siginfo();
+   info.si_signo = SIGTRAP;
+   info.si_

[tip: perf/core] perf: Add support for event removal on exec

2021-04-16 Thread tip-bot2 for Marco Elver
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 2e498d0a74e5b88a6689ae1b811f247f91ff188e
Gitweb:
https://git.kernel.org/tip/2e498d0a74e5b88a6689ae1b811f247f91ff188e
Author:Marco Elver 
AuthorDate:Thu, 08 Apr 2021 12:35:59 +02:00
Committer: Peter Zijlstra 
CommitterDate: Fri, 16 Apr 2021 16:32:41 +02:00

perf: Add support for event removal on exec

Adds bit perf_event_attr::remove_on_exec, to support removing an event
from a task on exec.

This option supports the case where an event is supposed to be
process-wide only, and should not propagate beyond exec, to limit
monitoring to the original process image only.

Suggested-by: Peter Zijlstra 
Signed-off-by: Marco Elver 
Signed-off-by: Peter Zijlstra (Intel) 
Link: https://lkml.kernel.org/r/20210408103605.1676875-5-el...@google.com
---
 include/uapi/linux/perf_event.h |  3 +-
 kernel/events/core.c| 70 
 2 files changed, 64 insertions(+), 9 deletions(-)

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 813efb6..8c5b9f5 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -390,7 +390,8 @@ struct perf_event_attr {
text_poke  :  1, /* include text poke 
events */
build_id   :  1, /* use build id in mmap2 
events */
inherit_thread :  1, /* children only inherit 
if cloned with CLONE_THREAD */
-   __reserved_1   : 28;
+   remove_on_exec :  1, /* event is removed from 
task on exec */
+   __reserved_1   : 27;
 
union {
__u32   wakeup_events;/* wakeup every n events */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3e3c00f..e4a584b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4248,6 +4248,57 @@ out:
put_ctx(clone_ctx);
 }
 
+static void perf_remove_from_owner(struct perf_event *event);
+static void perf_event_exit_event(struct perf_event *event,
+ struct perf_event_context *ctx);
+
+/*
+ * Removes all events from the current task that have been marked
+ * remove-on-exec, and feeds their values back to parent events.
+ */
+static void perf_event_remove_on_exec(int ctxn)
+{
+   struct perf_event_context *ctx, *clone_ctx = NULL;
+   struct perf_event *event, *next;
+   LIST_HEAD(free_list);
+   unsigned long flags;
+   bool modified = false;
+
+   ctx = perf_pin_task_context(current, ctxn);
+   if (!ctx)
+   return;
+
+   mutex_lock(>mutex);
+
+   if (WARN_ON_ONCE(ctx->task != current))
+   goto unlock;
+
+   list_for_each_entry_safe(event, next, >event_list, event_entry) {
+   if (!event->attr.remove_on_exec)
+   continue;
+
+   if (!is_kernel_event(event))
+   perf_remove_from_owner(event);
+
+   modified = true;
+
+   perf_event_exit_event(event, ctx);
+   }
+
+   raw_spin_lock_irqsave(>lock, flags);
+   if (modified)
+   clone_ctx = unclone_ctx(ctx);
+   --ctx->pin_count;
+   raw_spin_unlock_irqrestore(>lock, flags);
+
+unlock:
+   mutex_unlock(>mutex);
+
+   put_ctx(ctx);
+   if (clone_ctx)
+   put_ctx(clone_ctx);
+}
+
 struct perf_read_data {
struct perf_event *event;
bool group;
@@ -7560,18 +7611,18 @@ void perf_event_exec(void)
struct perf_event_context *ctx;
int ctxn;
 
-   rcu_read_lock();
for_each_task_context_nr(ctxn) {
-   ctx = current->perf_event_ctxp[ctxn];
-   if (!ctx)
-   continue;
-
perf_event_enable_on_exec(ctxn);
+   perf_event_remove_on_exec(ctxn);
 
-   perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL,
-  true);
+   rcu_read_lock();
+   ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+   if (ctx) {
+   perf_iterate_ctx(ctx, perf_event_addr_filters_exec,
+NULL, true);
+   }
+   rcu_read_unlock();
}
-   rcu_read_unlock();
 }
 
 struct remote_output {
@@ -11656,6 +11707,9 @@ static int perf_copy_attr(struct perf_event_attr __user 
*uattr,
if (!attr->inherit && attr->inherit_thread)
return -EINVAL;
 
+   if (attr->remove_on_exec && attr->enable_on_exec)
+   return -EINVAL;
+
 out:
return ret;
 


[tip: perf/core] selftests/perf_events: Add kselftest for process-wide sigtrap handling

2021-04-16 Thread tip-bot2 for Marco Elver
The following commit has been merged into the perf/core branch of tip:

Commit-ID: f2c3c32f45002de19c6dec33f32fd259e82f2557
Gitweb:
https://git.kernel.org/tip/f2c3c32f45002de19c6dec33f32fd259e82f2557
Author:Marco Elver 
AuthorDate:Thu, 08 Apr 2021 12:36:02 +02:00
Committer: Peter Zijlstra 
CommitterDate: Fri, 16 Apr 2021 16:32:42 +02:00

selftests/perf_events: Add kselftest for process-wide sigtrap handling

Add a kselftest for testing process-wide perf events with synchronous
SIGTRAP on events (using breakpoints). In particular, we want to test
that changes to the event propagate to all children, and the SIGTRAPs
are in fact synchronously sent to the thread where the event occurred.

Note: The "signal_stress" test case is also added later in the series to
perf tool's built-in tests. The test here is more elaborate in that
respect, which on one hand avoids bloating the perf tool unnecessarily,
but we also benefit from structured tests with TAP-compliant output that
the kselftest framework provides.

Signed-off-by: Marco Elver 
Signed-off-by: Peter Zijlstra (Intel) 
Link: https://lkml.kernel.org/r/20210408103605.1676875-8-el...@google.com
---
 tools/testing/selftests/perf_events/.gitignore|   2 +-
 tools/testing/selftests/perf_events/Makefile  |   6 +-
 tools/testing/selftests/perf_events/config|   1 +-
 tools/testing/selftests/perf_events/settings  |   1 +-
 tools/testing/selftests/perf_events/sigtrap_threads.c | 210 +-
 5 files changed, 220 insertions(+)
 create mode 100644 tools/testing/selftests/perf_events/.gitignore
 create mode 100644 tools/testing/selftests/perf_events/Makefile
 create mode 100644 tools/testing/selftests/perf_events/config
 create mode 100644 tools/testing/selftests/perf_events/settings
 create mode 100644 tools/testing/selftests/perf_events/sigtrap_threads.c

diff --git a/tools/testing/selftests/perf_events/.gitignore 
b/tools/testing/selftests/perf_events/.gitignore
new file mode 100644
index 000..4dc43e1
--- /dev/null
+++ b/tools/testing/selftests/perf_events/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+sigtrap_threads
diff --git a/tools/testing/selftests/perf_events/Makefile 
b/tools/testing/selftests/perf_events/Makefile
new file mode 100644
index 000..973a2c3
--- /dev/null
+++ b/tools/testing/selftests/perf_events/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0
+CFLAGS += -Wl,-no-as-needed -Wall -I../../../../usr/include
+LDFLAGS += -lpthread
+
+TEST_GEN_PROGS := sigtrap_threads
+include ../lib.mk
diff --git a/tools/testing/selftests/perf_events/config 
b/tools/testing/selftests/perf_events/config
new file mode 100644
index 000..ba58ff2
--- /dev/null
+++ b/tools/testing/selftests/perf_events/config
@@ -0,0 +1 @@
+CONFIG_PERF_EVENTS=y
diff --git a/tools/testing/selftests/perf_events/settings 
b/tools/testing/selftests/perf_events/settings
new file mode 100644
index 000..6091b45
--- /dev/null
+++ b/tools/testing/selftests/perf_events/settings
@@ -0,0 +1 @@
+timeout=120
diff --git a/tools/testing/selftests/perf_events/sigtrap_threads.c 
b/tools/testing/selftests/perf_events/sigtrap_threads.c
new file mode 100644
index 000..9c0fd44
--- /dev/null
+++ b/tools/testing/selftests/perf_events/sigtrap_threads.c
@@ -0,0 +1,210 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test for perf events with SIGTRAP across all threads.
+ *
+ * Copyright (C) 2021, Google LLC.
+ */
+
+#define _GNU_SOURCE
+
+/* We need the latest siginfo from the kernel repo. */
+#include 
+#include 
+#define __have_siginfo_t 1
+#define __have_sigval_t 1
+#define __have_sigevent_t 1
+#define __siginfo_t_defined
+#define __sigval_t_defined
+#define __sigevent_t_defined
+#define _BITS_SIGINFO_CONSTS_H 1
+#define _BITS_SIGEVENT_CONSTS_H 1
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "../kselftest_harness.h"
+
+#define NUM_THREADS 5
+
+/* Data shared between test body, threads, and signal handler. */
+static struct {
+   int tids_want_signal;   /* Which threads still want a signal. */
+   int signal_count;   /* Sanity check number of signals 
received. */
+   volatile int iterate_on;/* Variable to set breakpoint on. */
+   siginfo_t first_siginfo;/* First observed siginfo_t. */
+} ctx;
+
+/* Unique value to check si_perf is correctly set from 
perf_event_attr::sig_data. */
+#define TEST_SIG_DATA(addr) (~(uint64_t)(addr))
+
+static struct perf_event_attr make_event_attr(bool enabled, volatile void 
*addr)
+{
+   struct perf_event_attr attr = {
+   .type   = PERF_TYPE_BREAKPOINT,
+   .size   = sizeof(attr),
+   .sample_period  = 1,
+   .disabled   = !enabled,
+   .bp_addr= (unsigned long)addr,
+   .bp_type  

[tip: perf/core] selftests/perf_events: Add kselftest for remove_on_exec

2021-04-16 Thread tip-bot2 for Marco Elver
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 6216798bf98e82c382922f1b71ecc4a13d6e65cb
Gitweb:
https://git.kernel.org/tip/6216798bf98e82c382922f1b71ecc4a13d6e65cb
Author:Marco Elver 
AuthorDate:Thu, 08 Apr 2021 12:36:03 +02:00
Committer: Peter Zijlstra 
CommitterDate: Fri, 16 Apr 2021 16:32:42 +02:00

selftests/perf_events: Add kselftest for remove_on_exec

Add kselftest to test that remove_on_exec removes inherited events from
child tasks.

Signed-off-by: Marco Elver 
Signed-off-by: Peter Zijlstra (Intel) 
Link: https://lkml.kernel.org/r/20210408103605.1676875-9-el...@google.com
---
 tools/testing/selftests/perf_events/.gitignore   |   1 +-
 tools/testing/selftests/perf_events/Makefile |   2 +-
 tools/testing/selftests/perf_events/remove_on_exec.c | 260 ++-
 3 files changed, 262 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/perf_events/remove_on_exec.c

diff --git a/tools/testing/selftests/perf_events/.gitignore 
b/tools/testing/selftests/perf_events/.gitignore
index 4dc43e1..790c470 100644
--- a/tools/testing/selftests/perf_events/.gitignore
+++ b/tools/testing/selftests/perf_events/.gitignore
@@ -1,2 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0-only
 sigtrap_threads
+remove_on_exec
diff --git a/tools/testing/selftests/perf_events/Makefile 
b/tools/testing/selftests/perf_events/Makefile
index 973a2c3..fcafa5f 100644
--- a/tools/testing/selftests/perf_events/Makefile
+++ b/tools/testing/selftests/perf_events/Makefile
@@ -2,5 +2,5 @@
 CFLAGS += -Wl,-no-as-needed -Wall -I../../../../usr/include
 LDFLAGS += -lpthread
 
-TEST_GEN_PROGS := sigtrap_threads
+TEST_GEN_PROGS := sigtrap_threads remove_on_exec
 include ../lib.mk
diff --git a/tools/testing/selftests/perf_events/remove_on_exec.c 
b/tools/testing/selftests/perf_events/remove_on_exec.c
new file mode 100644
index 000..5814611
--- /dev/null
+++ b/tools/testing/selftests/perf_events/remove_on_exec.c
@@ -0,0 +1,260 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test for remove_on_exec.
+ *
+ * Copyright (C) 2021, Google LLC.
+ */
+
+#define _GNU_SOURCE
+
+/* We need the latest siginfo from the kernel repo. */
+#include 
+#include 
+#define __have_siginfo_t 1
+#define __have_sigval_t 1
+#define __have_sigevent_t 1
+#define __siginfo_t_defined
+#define __sigval_t_defined
+#define __sigevent_t_defined
+#define _BITS_SIGINFO_CONSTS_H 1
+#define _BITS_SIGEVENT_CONSTS_H 1
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "../kselftest_harness.h"
+
+static volatile int signal_count;
+
+static struct perf_event_attr make_event_attr(void)
+{
+   struct perf_event_attr attr = {
+   .type   = PERF_TYPE_HARDWARE,
+   .size   = sizeof(attr),
+   .config = PERF_COUNT_HW_INSTRUCTIONS,
+   .sample_period  = 1000,
+   .exclude_kernel = 1,
+   .exclude_hv = 1,
+   .disabled   = 1,
+   .inherit= 1,
+   /*
+* Children normally retain their inherited event on exec; with
+* remove_on_exec, we'll remove their event, but the parent and
+* any other non-exec'd children will keep their events.
+*/
+   .remove_on_exec = 1,
+   .sigtrap= 1,
+   };
+   return attr;
+}
+
+static void sigtrap_handler(int signum, siginfo_t *info, void *ucontext)
+{
+   if (info->si_code != TRAP_PERF) {
+   fprintf(stderr, "%s: unexpected si_code %d\n", __func__, 
info->si_code);
+   return;
+   }
+
+   signal_count++;
+}
+
+FIXTURE(remove_on_exec)
+{
+   struct sigaction oldact;
+   int fd;
+};
+
+FIXTURE_SETUP(remove_on_exec)
+{
+   struct perf_event_attr attr = make_event_attr();
+   struct sigaction action = {};
+
+   signal_count = 0;
+
+   /* Initialize sigtrap handler. */
+   action.sa_flags = SA_SIGINFO | SA_NODEFER;
+   action.sa_sigaction = sigtrap_handler;
+   sigemptyset(&action.sa_mask);
+   ASSERT_EQ(sigaction(SIGTRAP, &action, &self->oldact), 0);
+
+   /* Initialize perf event. */
+   self->fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 
PERF_FLAG_FD_CLOEXEC);
+   ASSERT_NE(self->fd, -1);
+}
+
+FIXTURE_TEARDOWN(remove_on_exec)
+{
+   close(self->fd);
+   sigaction(SIGTRAP, &self->oldact, NULL);
+}
+
+/* Verify event propagates to fork'd child. */
+TEST_F(remove_on_exec, fork_only)
+{
+   int status;
+   pid_t pid = fork();
+
+   if (pid == 0) {
+   ASSERT_EQ(signal_count, 0);
+   ASSERT_EQ(ioctl(self->fd, PERF_EVENT_IOC_ENABLE, 0), 0);
+   while (!signal_count);
+   _exit(42);
+   }
+
+   while (!signal_count); /* Child enables event. */
+   EXPECT_EQ(waitpid(pid, &status, 0)

Re: [PATCH v4 2/3] mm/slub, kunit: add a KUnit test for SLUB debugging functionality

2021-04-15 Thread Marco Elver
On Thu, 15 Apr 2021 at 12:38, Vlastimil Babka  wrote:
>
> On 4/15/21 12:10 PM, Oliver Glitta wrote:
> > ut 13. 4. 2021 o 15:54 Marco Elver  napísal(a):
> >>
> >> On Tue, 13 Apr 2021 at 12:07,  wrote:
> >> > From: Oliver Glitta 
> >> >
> >> > SLUB has resiliency_test() function which is hidden behind #ifdef
> >> > SLUB_RESILIENCY_TEST that is not part of Kconfig, so nobody
> >> > runs it. KUnit should be a proper replacement for it.
> >> >
> >> > Try changing byte in redzone after allocation and changing
> >> > pointer to next free node, first byte, 50th byte and redzone
> >> > byte. Check if validation finds errors.
> >> >
> >> > There are several differences from the original resiliency test:
> >> > Tests create own caches with known state instead of corrupting
> >> > shared kmalloc caches.
> >> >
> >> > The corruption of freepointer uses correct offset, the original
> >> > resiliency test got broken with freepointer changes.
> >> >
> >> > Scratch changing random byte test, because it does not have
> >> > meaning in this form where we need deterministic results.
> >> >
> >> > Add new option CONFIG_SLUB_KUNIT_TEST in Kconfig.
> >> > Because the test deliberately modifies non-allocated objects, it depends 
> >> > on
> >> > !KASAN which would have otherwise prevented that.
> >>
> >> Hmm, did the test fail with KASAN? Is it possible to skip the tests
> >> and still run a subset of tests with KASAN? It'd be nice if we could
> >> run some of these tests with KASAN as well.
> >>
> >> > Use kunit_resource to count errors in cache and silence bug reports.
> >> > Count error whenever slab_bug() or slab_fix() is called or when
> >> > the count of pages is wrong.
> >> >
> >> > Signed-off-by: Oliver Glitta 
> >>
> >> Reviewed-by: Marco Elver 
> >>
> >
> > Thank you.
> >
> >> Thanks, this all looks good to me. But perhaps do test what works with
> >> KASAN, to see if you need the !KASAN constraint for all cases.
> >
> > I tried to run tests with KASAN functionality disabled with function
> > kasan_disable_current() and three of the tests failed with wrong
> > errors counts.
> > So I add the !KASAN constraint for all tests, because the merge window
> > is coming, we want to know if this version is stable and without other
> > mistakes.
> > We will take a closer look at that in the follow-up patch.
>
> Agreed. In this context, KASAN is essentially a different implementation of 
> the
> same checks that SLUB_DEBUG offers (and also does other checks) and we 
> exercise
> these SLUB_DEBUG checks by deliberately causing the corruption that they 
> detect
> - so instead, KASAN detects it, as it should. I assume that once somebody opts
> for a full KASAN kernel build, they don't need the SLUB_DEBUG functionality at
> that point, as KASAN is more extensive (On the other hand SLUB_DEBUG kernels 
> can
> be (and are) shipped as production distro kernels where specific targetted
> debugging can be enabled to help find bugs in production with minimal 
> disruption).
> So trying to make both cooperate can work only to some extent and for now 
> we've
> chosen the safer way.

Sounds reasonable. In any case, I'm fine with this version to land and
my Reviewed-by above remains valid. :-)

Thanks,
-- Marco


Re: [peterz-queue:perf/core 18/22] kernel/events/core.c:6418:22: sparse: sparse: incorrect type in assignment (different address spaces)

2021-04-15 Thread Marco Elver
On Thu, Apr 15, 2021 at 10:48AM +0200, Peter Zijlstra wrote:
> On Wed, Apr 14, 2021 at 04:33:22PM +0200, Marco Elver wrote:
> > On Wed, Apr 14, 2021 at 10:10PM +0800, kernel test robot wrote:
> > > tree:   https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git 
> > > perf/core
> > > head:   0da503cd07380952599b67ded6efe030d78ea42d
> > > commit: c7d4112e9f0e69edd649665836ce72008b95ab9f [18/22] perf: Add 
> > > support for SIGTRAP on perf events
> > [...]
> > > If you fix the issue, kindly add following tag as appropriate
> > > Reported-by: kernel test robot 
> > [...]
> > >   6416info.si_errno = event->attr.type;
> > >   6417info.si_perf = event->attr.sig_data;
> > > > 6418info.si_addr = (void *)event->sig_addr;
> > >   6419force_sig_info();
> > 
> > I think it wants the below (feel free to squash into "perf: Add support
> > for SIGTRAP on perf events").
> > 
> > Thanks,
> > -- Marco
> > 
[...]
> 
> Now the silly robot complains about:
> 
> CC  kernel/events/core.o
> ../kernel/events/core.c: In function ‘perf_sigtrap’:
> ../kernel/events/core.c:6418:17: warning: cast to pointer from integer of 
> different size [-Wint-to-pointer-cast]
> 6418 |  info.si_addr = (void __user *)event->sig_addr;
> 
> for all 32bit builds (because sig_addr is u64 and the pointer cast
> truncates bits).
> 
> This had me look a little harder at sig_addr and I figured it should be
> next to the pending fields for cache locality.
> 
> I've ended up with the below delta, does that work for you?

Thanks, that works for me. Do note that I explicitly chose u64 for
sig_addr/pending_addr because data->addr is u64. There might be a new
warning about the u64 to unsigned long assignment on 32 bit arches.

Perhaps it needs something ugly like this:

info.si_addr = (void __user *)(unsigned long)event->pending_addr;

if pending_addr wants to be u64. Or just

event->pending_addr = (unsigned long)data->addr;

if data->addr being u64 on 32 bit arches is simply overkill.

Thanks,
-- Marco

> ---
> 
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -735,6 +735,7 @@ struct perf_event {
>   int pending_wakeup;
>   int pending_kill;
>   int pending_disable;
> + unsigned long   pending_addr;   /* SIGTRAP */
>   struct irq_work pending;
>  
>   atomic_tevent_limit;
> @@ -778,9 +779,6 @@ struct perf_event {
>   void *security;
>  #endif
>   struct list_headsb_list;
> -
> - /* Address associated with event, which can be passed to siginfo_t. */
> - u64 sig_addr;
>  #endif /* CONFIG_PERF_EVENTS */
>  };
>  
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -6415,7 +6415,7 @@ static void perf_sigtrap(struct perf_eve
>   info.si_code = TRAP_PERF;
>   info.si_errno = event->attr.type;
>   info.si_perf = event->attr.sig_data;
> - info.si_addr = (void __user *)event->sig_addr;
> + info.si_addr = (void __user *)event->pending_addr;
>   force_sig_info();
>  }
>  
> @@ -9137,7 +9137,7 @@ static int __perf_event_overflow(struct
>   if (events && atomic_dec_and_test(&event->event_limit)) {
>   ret = 1;
>   event->pending_kill = POLL_HUP;
> - event->sig_addr = data->addr;
> + event->pending_addr = data->addr;
>  
>   perf_event_disable_inatomic(event);
>   }


Re: [peterz-queue:perf/core 18/22] kernel/events/core.c:6418:22: sparse: sparse: incorrect type in assignment (different address spaces)

2021-04-14 Thread Marco Elver
On Wed, Apr 14, 2021 at 10:10PM +0800, kernel test robot wrote:
> tree:   https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git 
> perf/core
> head:   0da503cd07380952599b67ded6efe030d78ea42d
> commit: c7d4112e9f0e69edd649665836ce72008b95ab9f [18/22] perf: Add support 
> for SIGTRAP on perf events
[...]
> If you fix the issue, kindly add following tag as appropriate
> Reported-by: kernel test robot 
[...]
>   6416info.si_errno = event->attr.type;
>   6417info.si_perf = event->attr.sig_data;
> > 6418info.si_addr = (void *)event->sig_addr;
>   6419force_sig_info();

I think it wants the below (feel free to squash into "perf: Add support
for SIGTRAP on perf events").

Thanks,
-- Marco

-- >8 --

From: Marco Elver 
Date: Wed, 14 Apr 2021 16:26:26 +0200
Subject: [PATCH] perf: Fix cast to void __user pointer

sparse let us know that si_addr is 'void __user *', therefore add the
missing __user attribute to the cast.

Reported-by: kernel test robot 
Signed-off-by: Marco Elver 
---
 kernel/events/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1d2077389c0c..2677438ed668 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6414,7 +6414,7 @@ static void perf_sigtrap(struct perf_event *event)
info.si_code = TRAP_PERF;
info.si_errno = event->attr.type;
info.si_perf = event->attr.sig_data;
-   info.si_addr = (void *)event->sig_addr;
+   info.si_addr = (void __user *)event->sig_addr;
force_sig_info(&info);
 }
 
-- 
2.31.1.295.g9ea45b61b8-goog



[PATCH 8/9] kcsan: Report observed value changes

2021-04-14 Thread Marco Elver
From: Mark Rutland 

When a thread detects that a memory location was modified without its
watchpoint being hit, the report notes that a change was detected, but
does not provide concrete values for the change. Knowing the concrete
values can be very helpful in tracking down any racy writers (e.g. as
specific values may only be written in some portions of code, or under
certain conditions).

When we detect a modification, let's report the concrete old/new values,
along with the access's mask of relevant bits (and which relevant bits
were modified). This can make it easier to identify potential racy
writers. As the snapshots are at most 8 bytes, we can only report values
for accesses up to this size, but this appears to cater for the common
case.

When we detect a race via a watchpoint, we may or may not have concrete
values for the modification. To be helpful, let's attempt to log them
when we do as they can be ignored where irrelevant.

The resulting reports appears as follows, with values zero-padded to the
access width:

| ==
| BUG: KCSAN: data-race in el0_svc_common+0x34/0x25c 
arch/arm64/kernel/syscall.c:96
|
| race at unknown origin, with read to 0x7ae6aa00 of 8 bytes by task 
223 on cpu 1:
|  el0_svc_common+0x34/0x25c arch/arm64/kernel/syscall.c:96
|  do_el0_svc+0x48/0xec arch/arm64/kernel/syscall.c:178
|  el0_svc arch/arm64/kernel/entry-common.c:226 [inline]
|  el0_sync_handler+0x1a4/0x390 arch/arm64/kernel/entry-common.c:236
|  el0_sync+0x140/0x180 arch/arm64/kernel/entry.S:674
|
| value changed: 0x -> 0x0002
|
| Reported by Kernel Concurrency Sanitizer on:
| CPU: 1 PID: 223 Comm: syz-executor.1 Not tainted 
5.8.0-rc3-00094-ga73f923ecc8e-dirty #3
| Hardware name: linux,dummy-virt (DT)
| ==

If an access mask is set, it is shown underneath the "value changed"
line as "bits changed: 0x with mask 0x".

Signed-off-by: Mark Rutland 
[ el...@google.com: align "value changed" and "bits changed" lines,
  which required massaging the message; do not print bits+mask if no
  mask set. ]
Signed-off-by: Marco Elver 
---
 kernel/kcsan/core.c   |  5 +++--
 kernel/kcsan/kcsan.h  |  6 --
 kernel/kcsan/report.c | 31 ++-
 3 files changed, 33 insertions(+), 9 deletions(-)

diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c
index 6fe1513e1e6a..26709ea65c71 100644
--- a/kernel/kcsan/core.c
+++ b/kernel/kcsan/core.c
@@ -557,7 +557,8 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t 
size, int type)

atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]);
 
kcsan_report_known_origin(ptr, size, type, value_change,
- watchpoint - watchpoints);
+ watchpoint - watchpoints,
+ old, new, access_mask);
} else if (value_change == KCSAN_VALUE_CHANGE_TRUE) {
/* Inferring a race, since the value should not have changed. */
 
@@ -566,7 +567,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t 
size, int type)

atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]);
 
if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN) || 
is_assert)
-   kcsan_report_unknown_origin(ptr, size, type);
+   kcsan_report_unknown_origin(ptr, size, type, old, new, 
access_mask);
}
 
/*
diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h
index 572f119a19eb..f36e25c497ed 100644
--- a/kernel/kcsan/kcsan.h
+++ b/kernel/kcsan/kcsan.h
@@ -129,12 +129,14 @@ void kcsan_report_set_info(const volatile void *ptr, 
size_t size, int access_typ
  * thread.
  */
 void kcsan_report_known_origin(const volatile void *ptr, size_t size, int 
access_type,
-  enum kcsan_value_change value_change, int 
watchpoint_idx);
+  enum kcsan_value_change value_change, int 
watchpoint_idx,
+  u64 old, u64 new, u64 mask);
 
 /*
  * No other thread was observed to race with the access, but the data value
  * before and after the stall differs. Reports a race of "unknown origin".
  */
-void kcsan_report_unknown_origin(const volatile void *ptr, size_t size, int 
access_type);
+void kcsan_report_unknown_origin(const volatile void *ptr, size_t size, int 
access_type,
+u64 old, u64 new, u64 mask);
 
 #endif /* _KERNEL_KCSAN_KCSAN_H */
diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c
index 50cee2357885..e37e4386f86d 100644
--- a/kernel/kcsan/report.c
+++ b/kernel/kcsan/report.c
@@ -327,7 +327,8 @@ static void print_verbose_info(struct task_struct *task)
 
 static void print_report(enum kcsan_value_change value_change,

[PATCH 9/9] kcsan: Document "value changed" line

2021-04-14 Thread Marco Elver
Update the example reports based on the latest reports generated by
kcsan_test module, which now include the "value changed" line. Add a
brief description of the "value changed" line.

Signed-off-by: Marco Elver 
---
 Documentation/dev-tools/kcsan.rst | 88 ---
 1 file changed, 35 insertions(+), 53 deletions(-)

diff --git a/Documentation/dev-tools/kcsan.rst 
b/Documentation/dev-tools/kcsan.rst
index d85ce238ace7..ba059df10b7d 100644
--- a/Documentation/dev-tools/kcsan.rst
+++ b/Documentation/dev-tools/kcsan.rst
@@ -27,75 +27,57 @@ Error reports
 A typical data race report looks like this::
 
 ==
-BUG: KCSAN: data-race in generic_permission / kernfs_refresh_inode
-
-write to 0x8fee4c40700c of 4 bytes by task 175 on cpu 4:
- kernfs_refresh_inode+0x70/0x170
- kernfs_iop_permission+0x4f/0x90
- inode_permission+0x190/0x200
- link_path_walk.part.0+0x503/0x8e0
- path_lookupat.isra.0+0x69/0x4d0
- filename_lookup+0x136/0x280
- user_path_at_empty+0x47/0x60
- vfs_statx+0x9b/0x130
- __do_sys_newlstat+0x50/0xb0
- __x64_sys_newlstat+0x37/0x50
- do_syscall_64+0x85/0x260
- entry_SYSCALL_64_after_hwframe+0x44/0xa9
-
-read to 0x8fee4c40700c of 4 bytes by task 166 on cpu 6:
- generic_permission+0x5b/0x2a0
- kernfs_iop_permission+0x66/0x90
- inode_permission+0x190/0x200
- link_path_walk.part.0+0x503/0x8e0
- path_lookupat.isra.0+0x69/0x4d0
- filename_lookup+0x136/0x280
- user_path_at_empty+0x47/0x60
- do_faccessat+0x11a/0x390
- __x64_sys_access+0x3c/0x50
- do_syscall_64+0x85/0x260
- entry_SYSCALL_64_after_hwframe+0x44/0xa9
+BUG: KCSAN: data-race in test_kernel_read / test_kernel_write
+
+write to 0xc009a628 of 8 bytes by task 487 on cpu 0:
+ test_kernel_write+0x1d/0x30
+ access_thread+0x89/0xd0
+ kthread+0x23e/0x260
+ ret_from_fork+0x22/0x30
+
+read to 0xc009a628 of 8 bytes by task 488 on cpu 6:
+ test_kernel_read+0x10/0x20
+ access_thread+0x89/0xd0
+ kthread+0x23e/0x260
+ ret_from_fork+0x22/0x30
+
+value changed: 0x09a6 -> 0x09b2
 
 Reported by Kernel Concurrency Sanitizer on:
-CPU: 6 PID: 166 Comm: systemd-journal Not tainted 5.3.0-rc7+ #1
-Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 
04/01/2014
+CPU: 6 PID: 488 Comm: access_thread Not tainted 5.12.0-rc2+ #1
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 
04/01/2014
 ==
 
 The header of the report provides a short summary of the functions involved in
 the race. It is followed by the access types and stack traces of the 2 threads
-involved in the data race.
+involved in the data race. If KCSAN also observed a value change, the observed
+old value and new value are shown on the "value changed" line respectively.
 
 The other less common type of data race report looks like this::
 
 ==
-BUG: KCSAN: data-race in e1000_clean_rx_irq+0x551/0xb10
-
-race at unknown origin, with read to 0x933db8a2ae6c of 1 bytes by 
interrupt on cpu 0:
- e1000_clean_rx_irq+0x551/0xb10
- e1000_clean+0x533/0xda0
- net_rx_action+0x329/0x900
- __do_softirq+0xdb/0x2db
- irq_exit+0x9b/0xa0
- do_IRQ+0x9c/0xf0
- ret_from_intr+0x0/0x18
- default_idle+0x3f/0x220
- arch_cpu_idle+0x21/0x30
- do_idle+0x1df/0x230
- cpu_startup_entry+0x14/0x20
- rest_init+0xc5/0xcb
- arch_call_rest_init+0x13/0x2b
- start_kernel+0x6db/0x700
+BUG: KCSAN: data-race in test_kernel_rmw_array+0x71/0xd0
+
+race at unknown origin, with read to 0xc009bdb0 of 8 bytes by task 
515 on cpu 2:
+ test_kernel_rmw_array+0x71/0xd0
+ access_thread+0x89/0xd0
+ kthread+0x23e/0x260
+ ret_from_fork+0x22/0x30
+
+value changed: 0x2328 -> 0x2329
 
 Reported by Kernel Concurrency Sanitizer on:
-CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.3.0-rc7+ #2
-Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 
04/01/2014
+CPU: 2 PID: 515 Comm: access_thread Not tainted 5.12.0-rc2+ #1
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 
04/01/2014
 ==
 
 This report is generated where it was not possible to determine the other
 racing thread, but a race was inferred due to the data value of the watched
-memory location having changed. These can occur either due to missing
-instrumentation or e.g. DMA accesses. These reports will only be generated if
-``CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN=y`` (selected by default).
+memory location having changed. These reports always show a "value ch

[PATCH 7/9] kcsan: Remove kcsan_report_type

2021-04-14 Thread Marco Elver
From: Mark Rutland 

Now that the reporting code has been refactored, it's clear by
construction that print_report() can only be passed
KCSAN_REPORT_RACE_SIGNAL or KCSAN_REPORT_RACE_UNKNOWN_ORIGIN, and these
can also be distinguished by the presence of `other_info`.

Let's simplify things and remove the report type enum, and instead let's
check `other_info` to distinguish these cases. This allows us to remove
code for cases which are impossible and generally makes the code simpler.

There should be no functional change as a result of this patch.

Signed-off-by: Mark Rutland 
[ el...@google.com: add updated comments to kcsan_report_*() functions ]
Signed-off-by: Marco Elver 
---
 kernel/kcsan/kcsan.h  | 33 +
 kernel/kcsan/report.c | 29 +++--
 2 files changed, 20 insertions(+), 42 deletions(-)

diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h
index 2ee43fd5d6a4..572f119a19eb 100644
--- a/kernel/kcsan/kcsan.h
+++ b/kernel/kcsan/kcsan.h
@@ -116,32 +116,25 @@ enum kcsan_value_change {
KCSAN_VALUE_CHANGE_TRUE,
 };
 
-enum kcsan_report_type {
-   /*
-* The thread that set up the watchpoint and briefly stalled was
-* signalled that another thread triggered the watchpoint.
-*/
-   KCSAN_REPORT_RACE_SIGNAL,
-
-   /*
-* A thread found and consumed a matching watchpoint.
-*/
-   KCSAN_REPORT_CONSUMED_WATCHPOINT,
-
-   /*
-* No other thread was observed to race with the access, but the data
-* value before and after the stall differs.
-*/
-   KCSAN_REPORT_RACE_UNKNOWN_ORIGIN,
-};
-
 /*
- * Notify the report code that a race occurred.
+ * The calling thread hit and consumed a watchpoint: set the access information
+ * to be consumed by the reporting thread. No report is printed yet.
  */
 void kcsan_report_set_info(const volatile void *ptr, size_t size, int 
access_type,
   int watchpoint_idx);
+
+/*
+ * The calling thread observed that the watchpoint it set up was hit and
+ * consumed: print the full report based on information set by the racing
+ * thread.
+ */
 void kcsan_report_known_origin(const volatile void *ptr, size_t size, int 
access_type,
   enum kcsan_value_change value_change, int 
watchpoint_idx);
+
+/*
+ * No other thread was observed to race with the access, but the data value
+ * before and after the stall differs. Reports a race of "unknown origin".
+ */
 void kcsan_report_unknown_origin(const volatile void *ptr, size_t size, int 
access_type);
 
 #endif /* _KERNEL_KCSAN_KCSAN_H */
diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c
index ba924f110c95..50cee2357885 100644
--- a/kernel/kcsan/report.c
+++ b/kernel/kcsan/report.c
@@ -326,7 +326,6 @@ static void print_verbose_info(struct task_struct *task)
 }
 
 static void print_report(enum kcsan_value_change value_change,
-enum kcsan_report_type type,
 const struct access_info *ai,
 const struct other_info *other_info)
 {
@@ -343,7 +342,7 @@ static void print_report(enum kcsan_value_change 
value_change,
if (skip_report(KCSAN_VALUE_CHANGE_TRUE, stack_entries[skipnr]))
return;
 
-   if (type == KCSAN_REPORT_RACE_SIGNAL) {
+   if (other_info) {
other_skipnr = get_stack_skipnr(other_info->stack_entries,
other_info->num_stack_entries);
other_frame = other_info->stack_entries[other_skipnr];
@@ -358,8 +357,7 @@ static void print_report(enum kcsan_value_change 
value_change,
 
/* Print report header. */

pr_err("==\n");
-   switch (type) {
-   case KCSAN_REPORT_RACE_SIGNAL: {
+   if (other_info) {
int cmp;
 
/*
@@ -371,22 +369,15 @@ static void print_report(enum kcsan_value_change 
value_change,
   get_bug_type(ai->access_type | 
other_info->ai.access_type),
   (void *)(cmp < 0 ? other_frame : this_frame),
   (void *)(cmp < 0 ? this_frame : other_frame));
-   } break;
-
-   case KCSAN_REPORT_RACE_UNKNOWN_ORIGIN:
+   } else {
pr_err("BUG: KCSAN: %s in %pS\n", get_bug_type(ai->access_type),
   (void *)this_frame);
-   break;
-
-   default:
-   BUG();
}
 
pr_err("\n");
 
/* Print information about the racing accesses. */
-   switch (type) {
-   case KCSAN_REPORT_RACE_SIGNAL:
+   if (other_info) {
pr_err("%s to 0x%px of %zu bytes by %s on cpu %i:\n",
   get_access_type(other_info->ai.access_type), 
other_info->ai.ptr,
   other_info->ai

[PATCH 4/9] kcsan: Fold panic() call into print_report()

2021-04-14 Thread Marco Elver
From: Mark Rutland 

So that we can add more callers of print_report(), let's fold the panic()
call into print_report() so the caller doesn't have to handle this
explicitly.

There should be no functional change as a result of this patch.

Signed-off-by: Mark Rutland 
Signed-off-by: Marco Elver 
---
 kernel/kcsan/report.c | 21 -
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c
index 88225f6d471e..8bfa970965a1 100644
--- a/kernel/kcsan/report.c
+++ b/kernel/kcsan/report.c
@@ -325,10 +325,7 @@ static void print_verbose_info(struct task_struct *task)
print_irqtrace_events(task);
 }
 
-/*
- * Returns true if a report was generated, false otherwise.
- */
-static bool print_report(enum kcsan_value_change value_change,
+static void print_report(enum kcsan_value_change value_change,
 enum kcsan_report_type type,
 const struct access_info *ai,
 const struct other_info *other_info)
@@ -344,7 +341,7 @@ static bool print_report(enum kcsan_value_change 
value_change,
 * Must check report filter rules before starting to print.
 */
if (skip_report(KCSAN_VALUE_CHANGE_TRUE, stack_entries[skipnr]))
-   return false;
+   return;
 
if (type == KCSAN_REPORT_RACE_SIGNAL) {
other_skipnr = get_stack_skipnr(other_info->stack_entries,
@@ -353,11 +350,11 @@ static bool print_report(enum kcsan_value_change 
value_change,
 
/* @value_change is only known for the other thread */
if (skip_report(value_change, other_frame))
-   return false;
+   return;
}
 
if (rate_limit_report(this_frame, other_frame))
-   return false;
+   return;
 
/* Print report header. */

pr_err("==\n");
@@ -431,7 +428,8 @@ static bool print_report(enum kcsan_value_change 
value_change,
dump_stack_print_info(KERN_DEFAULT);

pr_err("==\n");
 
-   return true;
+   if (panic_on_warn)
+   panic("panic_on_warn set ...\n");
 }
 
 static void release_report(unsigned long *flags, struct other_info *other_info)
@@ -628,11 +626,8 @@ static void kcsan_report(const volatile void *ptr, size_t 
size, int access_type,
 * either TRUE or MAYBE. In case of MAYBE, further filtering may
 * be done once we know the full stack trace in print_report().
 */
-   bool reported = value_change != KCSAN_VALUE_CHANGE_FALSE &&
-   print_report(value_change, type, , 
other_info);
-
-   if (reported && panic_on_warn)
-   panic("panic_on_warn set ...\n");
+   if (value_change != KCSAN_VALUE_CHANGE_FALSE)
+   print_report(value_change, type, , other_info);
 
release_report(&flags, other_info);
}
-- 
2.31.1.295.g9ea45b61b8-goog



[PATCH 2/9] kcsan: Distinguish kcsan_report() calls

2021-04-14 Thread Marco Elver
From: Mark Rutland 

Currently kcsan_report() is used to handle three distinct cases:

* The caller hit a watchpoint when attempting an access. Some
  information regarding the caller and access are recorded, but no
  output is produced.

* A caller which previously setup a watchpoint detected that the
  watchpoint has been hit, and possibly detected a change to the
  location in memory being watched. This may result in output reporting
  the interaction between this caller and the caller which hit the
  watchpoint.

* A caller detected a change to a modification to a memory location
  which wasn't detected by a watchpoint, for which there is no
  information on the other thread. This may result in output reporting
  the unexpected change.

... depending on the specific case the caller has distinct pieces of
information available, but the prototype of kcsan_report() has to handle
all three cases. This means that in some cases we pass redundant
information, and in others we don't pass all the information we could
pass. This also means that the report code has to demux these three
cases.

So that we can pass some additional information while also simplifying
the callers and report code, add separate kcsan_report_*() functions for
the distinct cases, updating callers accordingly. As the watchpoint_idx
is unused in the case of kcsan_report_unknown_origin(), this passes a
dummy value into kcsan_report(). Subsequent patches will refactor the
report code to avoid this.

There should be no functional change as a result of this patch.

Signed-off-by: Mark Rutland 
[ el...@google.com: try to make kcsan_report_*() names more descriptive ]
Signed-off-by: Marco Elver 
---
 kernel/kcsan/core.c   | 12 
 kernel/kcsan/kcsan.h  | 10 ++
 kernel/kcsan/report.c | 26 +++---
 3 files changed, 33 insertions(+), 15 deletions(-)

diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c
index d360183002d6..6fe1513e1e6a 100644
--- a/kernel/kcsan/core.c
+++ b/kernel/kcsan/core.c
@@ -380,9 +380,7 @@ static noinline void kcsan_found_watchpoint(const volatile 
void *ptr,
 
if (consumed) {
kcsan_save_irqtrace(current);
-   kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_MAYBE,
-KCSAN_REPORT_CONSUMED_WATCHPOINT,
-watchpoint - watchpoints);
+   kcsan_report_set_info(ptr, size, type, watchpoint - 
watchpoints);
kcsan_restore_irqtrace(current);
} else {
/*
@@ -558,8 +556,8 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t 
size, int type)
if (is_assert && value_change == KCSAN_VALUE_CHANGE_TRUE)

atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]);
 
-   kcsan_report(ptr, size, type, value_change, 
KCSAN_REPORT_RACE_SIGNAL,
-watchpoint - watchpoints);
+   kcsan_report_known_origin(ptr, size, type, value_change,
+ watchpoint - watchpoints);
} else if (value_change == KCSAN_VALUE_CHANGE_TRUE) {
/* Inferring a race, since the value should not have changed. */
 
@@ -568,9 +566,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t 
size, int type)

atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]);
 
if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN) || 
is_assert)
-   kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_TRUE,
-KCSAN_REPORT_RACE_UNKNOWN_ORIGIN,
-watchpoint - watchpoints);
+   kcsan_report_unknown_origin(ptr, size, type);
}
 
/*
diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h
index 9881099d4179..2ee43fd5d6a4 100644
--- a/kernel/kcsan/kcsan.h
+++ b/kernel/kcsan/kcsan.h
@@ -136,10 +136,12 @@ enum kcsan_report_type {
 };
 
 /*
- * Print a race report from thread that encountered the race.
+ * Notify the report code that a race occurred.
  */
-extern void kcsan_report(const volatile void *ptr, size_t size, int 
access_type,
-enum kcsan_value_change value_change,
-enum kcsan_report_type type, int watchpoint_idx);
+void kcsan_report_set_info(const volatile void *ptr, size_t size, int 
access_type,
+  int watchpoint_idx);
+void kcsan_report_known_origin(const volatile void *ptr, size_t size, int 
access_type,
+  enum kcsan_value_change value_change, int 
watchpoint_idx);
+void kcsan_report_unknown_origin(const volatile void *ptr, size_t size, int 
access_type);
 
 #endif /* _KERNEL_KCSAN_KCSAN_H */
diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c
index 13dce3c664d6..5232bf218ea7 100644
--- a/kernel/kcsan/report.c
+++ b/kernel/kcsan/report.c
@@ -598,9 +598,9 @@ static noinl

[PATCH 1/9] kcsan: Simplify value change detection

2021-04-14 Thread Marco Elver
From: Mark Rutland 

In kcsan_setup_watchpoint() we store snapshots of a watched value into a
union of u8/u16/u32/u64 sized fields, modify this in place using a
consistent field, then later check for any changes via the u64 field.

We can achieve the same effect more simply by always treating the field
as a u64, as smaller values will be zero-extended. As the values are
zero-extended, we don't need to truncate the access_mask when we apply
it, and can always apply the full 64-bit access_mask to the 64-bit
value.

Finally, we can store the two snapshots and calculated difference
separately, which makes the code a little easier to read, and will
permit reporting the old/new values in subsequent patches.

There should be no functional change as a result of this patch.

Signed-off-by: Mark Rutland 
Signed-off-by: Marco Elver 
---
 kernel/kcsan/core.c | 40 
 1 file changed, 16 insertions(+), 24 deletions(-)

diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c
index 45c821d4e8bd..d360183002d6 100644
--- a/kernel/kcsan/core.c
+++ b/kernel/kcsan/core.c
@@ -407,12 +407,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t 
size, int type)
const bool is_write = (type & KCSAN_ACCESS_WRITE) != 0;
const bool is_assert = (type & KCSAN_ACCESS_ASSERT) != 0;
atomic_long_t *watchpoint;
-   union {
-   u8 _1;
-   u16 _2;
-   u32 _4;
-   u64 _8;
-   } expect_value;
+   u64 old, new, diff;
unsigned long access_mask;
enum kcsan_value_change value_change = KCSAN_VALUE_CHANGE_MAYBE;
unsigned long ua_flags = user_access_save();
@@ -468,19 +463,19 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t 
size, int type)
 * Read the current value, to later check and infer a race if the data
 * was modified via a non-instrumented access, e.g. from a device.
 */
-   expect_value._8 = 0;
+   old = 0;
switch (size) {
case 1:
-   expect_value._1 = READ_ONCE(*(const u8 *)ptr);
+   old = READ_ONCE(*(const u8 *)ptr);
break;
case 2:
-   expect_value._2 = READ_ONCE(*(const u16 *)ptr);
+   old = READ_ONCE(*(const u16 *)ptr);
break;
case 4:
-   expect_value._4 = READ_ONCE(*(const u32 *)ptr);
+   old = READ_ONCE(*(const u32 *)ptr);
break;
case 8:
-   expect_value._8 = READ_ONCE(*(const u64 *)ptr);
+   old = READ_ONCE(*(const u64 *)ptr);
break;
default:
break; /* ignore; we do not diff the values */
@@ -506,33 +501,30 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t 
size, int type)
 * racy access.
 */
access_mask = get_ctx()->access_mask;
+   new = 0;
switch (size) {
case 1:
-   expect_value._1 ^= READ_ONCE(*(const u8 *)ptr);
-   if (access_mask)
-   expect_value._1 &= (u8)access_mask;
+   new = READ_ONCE(*(const u8 *)ptr);
break;
case 2:
-   expect_value._2 ^= READ_ONCE(*(const u16 *)ptr);
-   if (access_mask)
-   expect_value._2 &= (u16)access_mask;
+   new = READ_ONCE(*(const u16 *)ptr);
break;
case 4:
-   expect_value._4 ^= READ_ONCE(*(const u32 *)ptr);
-   if (access_mask)
-   expect_value._4 &= (u32)access_mask;
+   new = READ_ONCE(*(const u32 *)ptr);
break;
case 8:
-   expect_value._8 ^= READ_ONCE(*(const u64 *)ptr);
-   if (access_mask)
-   expect_value._8 &= (u64)access_mask;
+   new = READ_ONCE(*(const u64 *)ptr);
break;
default:
break; /* ignore; we do not diff the values */
}
 
+   diff = old ^ new;
+   if (access_mask)
+   diff &= access_mask;
+
/* Were we able to observe a value-change? */
-   if (expect_value._8 != 0)
+   if (diff != 0)
value_change = KCSAN_VALUE_CHANGE_TRUE;
 
/* Check if this access raced with another. */
-- 
2.31.1.295.g9ea45b61b8-goog



[PATCH 3/9] kcsan: Refactor passing watchpoint/other_info

2021-04-14 Thread Marco Elver
From: Mark Rutland 

The `watchpoint_idx` argument to kcsan_report() isn't meaningful for
races which were not detected by a watchpoint, and it would be clearer
if callers passed the other_info directly so that a NULL value can be
passed in this case.

Given that callers manipulate their watchpoints before passing the index
into kcsan_report_*(), and given we index the `other_infos` array using
this before we sanity-check it, the subsequent sanity check isn't all
that useful.

Let's remove the `watchpoint_idx` sanity check, and move the job of
finding the `other_info` out of kcsan_report().

Other than the removal of the check, there should be no functional
change as a result of this patch.

Signed-off-by: Mark Rutland 
Signed-off-by: Marco Elver 
---
 kernel/kcsan/report.c | 13 -
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c
index 5232bf218ea7..88225f6d471e 100644
--- a/kernel/kcsan/report.c
+++ b/kernel/kcsan/report.c
@@ -600,7 +600,7 @@ static noinline bool prepare_report(unsigned long *flags,
 
 static void kcsan_report(const volatile void *ptr, size_t size, int 
access_type,
 enum kcsan_value_change value_change,
-enum kcsan_report_type type, int watchpoint_idx)
+enum kcsan_report_type type, struct other_info 
*other_info)
 {
unsigned long flags = 0;
const struct access_info ai = {
@@ -610,12 +610,8 @@ static void kcsan_report(const volatile void *ptr, size_t 
size, int access_type,
.task_pid   = in_task() ? task_pid_nr(current) : -1,
.cpu_id = raw_smp_processor_id()
};
-   struct other_info *other_info = type == KCSAN_REPORT_RACE_UNKNOWN_ORIGIN
-   ? NULL : _infos[watchpoint_idx];
 
kcsan_disable_current();
-   if (WARN_ON(watchpoint_idx < 0 || watchpoint_idx >= 
ARRAY_SIZE(other_infos)))
-   goto out;
 
/*
 * Because we may generate reports when we're in scheduler code, the use
@@ -642,7 +638,6 @@ static void kcsan_report(const volatile void *ptr, size_t 
size, int access_type,
}
 
lockdep_on();
-out:
kcsan_enable_current();
 }
 
@@ -650,18 +645,18 @@ void kcsan_report_set_info(const volatile void *ptr, 
size_t size, int access_typ
   int watchpoint_idx)
 {
kcsan_report(ptr, size, access_type, KCSAN_VALUE_CHANGE_MAYBE,
-KCSAN_REPORT_CONSUMED_WATCHPOINT, watchpoint_idx);
+KCSAN_REPORT_CONSUMED_WATCHPOINT, 
_infos[watchpoint_idx]);
 }
 
 void kcsan_report_known_origin(const volatile void *ptr, size_t size, int 
access_type,
   enum kcsan_value_change value_change, int 
watchpoint_idx)
 {
kcsan_report(ptr, size, access_type, value_change,
-KCSAN_REPORT_RACE_SIGNAL, watchpoint_idx);
+KCSAN_REPORT_RACE_SIGNAL, _infos[watchpoint_idx]);
 }
 
 void kcsan_report_unknown_origin(const volatile void *ptr, size_t size, int 
access_type)
 {
kcsan_report(ptr, size, access_type, KCSAN_VALUE_CHANGE_TRUE,
-KCSAN_REPORT_RACE_UNKNOWN_ORIGIN, 0);
+KCSAN_REPORT_RACE_UNKNOWN_ORIGIN, NULL);
 }
-- 
2.31.1.295.g9ea45b61b8-goog



[PATCH 6/9] kcsan: Remove reporting indirection

2021-04-14 Thread Marco Elver
From: Mark Rutland 

Now that we have separate kcsan_report_*() functions, we can factor the
distinct logic for each of the report cases out of kcsan_report(). While
this means each case has to handle mutual exclusion independently, this
minimizes the conditionality of code and makes it easier to read, and
will permit passing distinct bits of information to print_report() in
future.

There should be no functional change as a result of this patch.

Signed-off-by: Mark Rutland 
[ el...@google.com: retain comment about lockdep_off() ]
Signed-off-by: Marco Elver 
---
 kernel/kcsan/report.c | 115 ++
 1 file changed, 49 insertions(+), 66 deletions(-)

diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c
index d8441bed065c..ba924f110c95 100644
--- a/kernel/kcsan/report.c
+++ b/kernel/kcsan/report.c
@@ -434,13 +434,11 @@ static void print_report(enum kcsan_value_change 
value_change,
 
 static void release_report(unsigned long *flags, struct other_info *other_info)
 {
-   if (other_info)
-   /*
-* Use size to denote valid/invalid, since KCSAN entirely
-* ignores 0-sized accesses.
-*/
-   other_info->ai.size = 0;
-
+   /*
+* Use size to denote valid/invalid, since KCSAN entirely ignores
+* 0-sized accesses.
+*/
+   other_info->ai.size = 0;
raw_spin_unlock_irqrestore(_lock, *flags);
 }
 
@@ -573,61 +571,6 @@ static bool prepare_report_consumer(unsigned long *flags,
return false;
 }
 
-/*
- * Depending on the report type either sets @other_info and returns false, or
- * awaits @other_info and returns true. If @other_info is not required for the
- * report type, simply acquires @report_lock and returns true.
- */
-static noinline bool prepare_report(unsigned long *flags,
-   enum kcsan_report_type type,
-   const struct access_info *ai,
-   struct other_info *other_info)
-{
-   switch (type) {
-   case KCSAN_REPORT_CONSUMED_WATCHPOINT:
-   prepare_report_producer(flags, ai, other_info);
-   return false;
-   case KCSAN_REPORT_RACE_SIGNAL:
-   return prepare_report_consumer(flags, ai, other_info);
-   default:
-   /* @other_info not required; just acquire @report_lock. */
-   raw_spin_lock_irqsave(_lock, *flags);
-   return true;
-   }
-}
-
-static void kcsan_report(const struct access_info *ai, enum kcsan_value_change 
value_change,
-enum kcsan_report_type type, struct other_info 
*other_info)
-{
-   unsigned long flags = 0;
-
-   kcsan_disable_current();
-
-   /*
-* Because we may generate reports when we're in scheduler code, the use
-* of printk() could deadlock. Until such time that all printing code
-* called in print_report() is scheduler-safe, accept the risk, and just
-* get our message out. As such, also disable lockdep to hide the
-* warning, and avoid disabling lockdep for the rest of the kernel.
-*/
-   lockdep_off();
-
-   if (prepare_report(, type, ai, other_info)) {
-   /*
-* Never report if value_change is FALSE, only if it is
-* either TRUE or MAYBE. In case of MAYBE, further filtering may
-* be done once we know the full stack trace in print_report().
-*/
-   if (value_change != KCSAN_VALUE_CHANGE_FALSE)
-   print_report(value_change, type, ai, other_info);
-
-   release_report(, other_info);
-   }
-
-   lockdep_on();
-   kcsan_enable_current();
-}
-
 static struct access_info prepare_access_info(const volatile void *ptr, size_t 
size,
  int access_type)
 {
@@ -644,22 +587,62 @@ void kcsan_report_set_info(const volatile void *ptr, 
size_t size, int access_typ
   int watchpoint_idx)
 {
const struct access_info ai = prepare_access_info(ptr, size, 
access_type);
+   unsigned long flags;
+
+   kcsan_disable_current();
+   lockdep_off(); /* See kcsan_report_known_origin(). */
 
-   kcsan_report(, KCSAN_VALUE_CHANGE_MAYBE, 
KCSAN_REPORT_CONSUMED_WATCHPOINT,
-_infos[watchpoint_idx]);
+   prepare_report_producer(, , _infos[watchpoint_idx]);
+
+   lockdep_on();
+   kcsan_enable_current();
 }
 
 void kcsan_report_known_origin(const volatile void *ptr, size_t size, int 
access_type,
   enum kcsan_value_change value_change, int 
watchpoint_idx)
 {
const struct access_info ai = prepare_access_info(ptr, size, 
access_type);
+   struct other_info *other_info = _infos[watchpoint_idx];
+   unsigned long flags = 0;
 
-   kcsan_report(, value_change, KCSAN_REPORT_RACE_

[PATCH 5/9] kcsan: Refactor access_info initialization

2021-04-14 Thread Marco Elver
From: Mark Rutland 

In subsequent patches we'll want to split kcsan_report() into distinct
handlers for each report type. The largest bit of common work is
initializing the `access_info`, so let's factor this out into a helper,
and have the kcsan_report_*() functions pass the `access_info` as a
parameter to kcsan_report().

There should be no functional change as a result of this patch.

Signed-off-by: Mark Rutland 
Signed-off-by: Marco Elver 
---
 kernel/kcsan/report.c | 42 +-
 1 file changed, 25 insertions(+), 17 deletions(-)

diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c
index 8bfa970965a1..d8441bed065c 100644
--- a/kernel/kcsan/report.c
+++ b/kernel/kcsan/report.c
@@ -596,18 +596,10 @@ static noinline bool prepare_report(unsigned long *flags,
}
 }
 
-static void kcsan_report(const volatile void *ptr, size_t size, int 
access_type,
-enum kcsan_value_change value_change,
+static void kcsan_report(const struct access_info *ai, enum kcsan_value_change 
value_change,
 enum kcsan_report_type type, struct other_info 
*other_info)
 {
unsigned long flags = 0;
-   const struct access_info ai = {
-   .ptr= ptr,
-   .size   = size,
-   .access_type= access_type,
-   .task_pid   = in_task() ? task_pid_nr(current) : -1,
-   .cpu_id = raw_smp_processor_id()
-   };
 
kcsan_disable_current();
 
@@ -620,14 +612,14 @@ static void kcsan_report(const volatile void *ptr, size_t 
size, int access_type,
 */
lockdep_off();
 
-   if (prepare_report(, type, , other_info)) {
+   if (prepare_report(, type, ai, other_info)) {
/*
 * Never report if value_change is FALSE, only if it is
 * either TRUE or MAYBE. In case of MAYBE, further filtering may
 * be done once we know the full stack trace in print_report().
 */
if (value_change != KCSAN_VALUE_CHANGE_FALSE)
-   print_report(value_change, type, , other_info);
+   print_report(value_change, type, ai, other_info);
 
release_report(, other_info);
}
@@ -636,22 +628,38 @@ static void kcsan_report(const volatile void *ptr, size_t 
size, int access_type,
kcsan_enable_current();
 }
 
+static struct access_info prepare_access_info(const volatile void *ptr, size_t 
size,
+ int access_type)
+{
+   return (struct access_info) {
+   .ptr= ptr,
+   .size   = size,
+   .access_type= access_type,
+   .task_pid   = in_task() ? task_pid_nr(current) : -1,
+   .cpu_id = raw_smp_processor_id()
+   };
+}
+
 void kcsan_report_set_info(const volatile void *ptr, size_t size, int 
access_type,
   int watchpoint_idx)
 {
-   kcsan_report(ptr, size, access_type, KCSAN_VALUE_CHANGE_MAYBE,
-KCSAN_REPORT_CONSUMED_WATCHPOINT, 
_infos[watchpoint_idx]);
+   const struct access_info ai = prepare_access_info(ptr, size, 
access_type);
+
+   kcsan_report(, KCSAN_VALUE_CHANGE_MAYBE, 
KCSAN_REPORT_CONSUMED_WATCHPOINT,
+_infos[watchpoint_idx]);
 }
 
 void kcsan_report_known_origin(const volatile void *ptr, size_t size, int 
access_type,
   enum kcsan_value_change value_change, int 
watchpoint_idx)
 {
-   kcsan_report(ptr, size, access_type, value_change,
-KCSAN_REPORT_RACE_SIGNAL, _infos[watchpoint_idx]);
+   const struct access_info ai = prepare_access_info(ptr, size, 
access_type);
+
+   kcsan_report(, value_change, KCSAN_REPORT_RACE_SIGNAL, 
_infos[watchpoint_idx]);
 }
 
 void kcsan_report_unknown_origin(const volatile void *ptr, size_t size, int 
access_type)
 {
-   kcsan_report(ptr, size, access_type, KCSAN_VALUE_CHANGE_TRUE,
-KCSAN_REPORT_RACE_UNKNOWN_ORIGIN, NULL);
+   const struct access_info ai = prepare_access_info(ptr, size, 
access_type);
+
+   kcsan_report(, KCSAN_VALUE_CHANGE_TRUE, 
KCSAN_REPORT_RACE_UNKNOWN_ORIGIN, NULL);
 }
-- 
2.31.1.295.g9ea45b61b8-goog



[PATCH 0/9] kcsan: Add support for reporting observed value changes

2021-04-14 Thread Marco Elver
This series adds support for showing observed value changes in reports.
Several clean up and refactors of KCSAN reporting code are done as a
pre-requisite. An example of the new KCSAN reports:

==
BUG: KCSAN: data-race in test_kernel_read / test_kernel_write

write to 0xc009a628 of 8 bytes by task 487 on cpu 0:
 test_kernel_write+0x1d/0x30
 access_thread+0x89/0xd0
 kthread+0x23e/0x260
 ret_from_fork+0x22/0x30

read to 0xc009a628 of 8 bytes by task 488 on cpu 6:
 test_kernel_read+0x10/0x20
 access_thread+0x89/0xd0
 kthread+0x23e/0x260
 ret_from_fork+0x22/0x30

value changed: 0x09a6 -> 0x09b2

Reported by Kernel Concurrency Sanitizer on:
CPU: 6 PID: 488 Comm: access_thread Not tainted 5.12.0-rc2+ #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 
04/01/2014
==

On one hand this will help better understand "race of unknown origin"
(one stack trace only) reports, but also provides more information to
better understand normal data race reports like above where KCSAN also
detected a value change.

Changelog
-

This series was originally prepared courtesy of Mark Rutland in
September 2020. Because KCSAN had a few minor changes since the original
draft of the series, it required a rebase and re-test. To not be
forgotten and get these changes in sooner than later, Mark kindly agreed
to me adopting the series and doing the rebase, a few minor tweaks, and
finally re-test.

Marco Elver (1):
  kcsan: Document "value changed" line

Mark Rutland (8):
  kcsan: Simplify value change detection
  kcsan: Distinguish kcsan_report() calls
  kcsan: Refactor passing watchpoint/other_info
  kcsan: Fold panic() call into print_report()
  kcsan: Refactor access_info initialization
  kcsan: Remove reporting indirection
  kcsan: Remove kcsan_report_type
  kcsan: Report observed value changes

 Documentation/dev-tools/kcsan.rst |  88 +++-
 kernel/kcsan/core.c   |  53 --
 kernel/kcsan/kcsan.h  |  39 ---
 kernel/kcsan/report.c | 169 --
 4 files changed, 162 insertions(+), 187 deletions(-)

-- 
2.31.1.295.g9ea45b61b8-goog



Re: [PATCH v2] Documentation: dev-tools: Add Testing Overview

2021-04-14 Thread Marco Elver
On Wed, 14 Apr 2021 at 10:15, David Gow  wrote:
>
> The kernel now has a number of testing and debugging tools, and we've
> seen a bit of confusion about what the differences between them are.
>
> Add a basic documentation outlining the testing tools, when to use each,
> and how they interact.
>
> This is a pretty quick overview rather than the idealised "kernel
> testing guide" that'd probably be optimal, but given the number of times
> questions like "When do you use KUnit and when do you use Kselftest?"
> are being asked, it seemed worth at least having something. Hopefully
> this can form the basis for more detailed documentation later.
>
> Signed-off-by: David Gow 

Reviewed-by: Marco Elver 

Looks good to me, thanks. I think one can write whole articles (or
even books) about this topic, so it's easy to forget this is a quick
overview, and keep on adding.

> ---
> Thanks, everyone, for the comments on the doc. I've made a few of the
> suggested changes. Please let me know what you think!
>
> -- David
>
> Changes since v1:
> https://lore.kernel.org/linux-kselftest/20210410070529.4113432-1-david...@google.com/
> - Note KUnit's speed and that one should provide selftests for syscalls
> - Mention lockdep as a Dynamic Analysis Tool
> - Refer to "Dynamic Analysis Tools" instead of "Sanitizers"
> - A number of minor formatting tweaks and rewordings for clarity.
>
> Not changed:
> - I haven't included an exhaustive list of differences, advantages, etc,
>   between KUnit and kselftest: for now, the doc continues to focus on
>   the difference between 'in-kernel' and 'userspace' testing here.
> - Similarly, I'm not linking out to docs defining and describing "Unit"
>   tests versus "End-to-end" tests. None of the existing documentation
>   elsewhere quite matches what we do in the kernel perfectly, so it
>   seems less confusing to focus on the 'in-kernel'/'userspace'
>   distinction, and leave other definitions as a passing mention for
>   those who are already familiar with the concepts.
> - I haven't linked to any talk videos here: a few of them are linked on
>   (e.g.) the KUnit webpage, but I wanted to keep the Kernel documentation
>   more self-contained for now. No objection to adding them in a follow-up
>   patch if people feel strongly about it, though.
> - The link from index.rst to this doc is unchanged. I personally think
>   that the link is prominent enough there: it's the first link, and
>   shows up a few times. One possibility if people disagreed would be to
>   merge this page with the index, but given not all dev-tools are going
>   to be testing-related, it seemed a bit arrogant. :-)
>
>  Documentation/dev-tools/index.rst|   3 +
>  Documentation/dev-tools/testing-overview.rst | 117 +++
>  2 files changed, 120 insertions(+)
>  create mode 100644 Documentation/dev-tools/testing-overview.rst
>
> diff --git a/Documentation/dev-tools/index.rst 
> b/Documentation/dev-tools/index.rst
> index 1b1cf4f5c9d9..f590e5860794 100644
> --- a/Documentation/dev-tools/index.rst
> +++ b/Documentation/dev-tools/index.rst
> @@ -7,6 +7,8 @@ be used to work on the kernel. For now, the documents have 
> been pulled
>  together without any significant effort to integrate them into a coherent
>  whole; patches welcome!
>
> +A brief overview of testing-specific tools can be found in 
> :doc:`testing-overview`.
> +
>  .. class:: toc-title
>
>Table of contents
> @@ -14,6 +16,7 @@ whole; patches welcome!
>  .. toctree::
> :maxdepth: 2
>
> +   testing-overview
> coccinelle
> sparse
> kcov
> diff --git a/Documentation/dev-tools/testing-overview.rst 
> b/Documentation/dev-tools/testing-overview.rst
> new file mode 100644
> index ..ce36a8cdf6b5
> --- /dev/null
> +++ b/Documentation/dev-tools/testing-overview.rst
> @@ -0,0 +1,117 @@
> +.. SPDX-License-Identifier: GPL-2.0
> +
> +
> +Kernel Testing Guide
> +
> +
> +
> +There are a number of different tools for testing the Linux kernel, so 
> knowing
> +when to use each of them can be a challenge. This document provides a rough
> +overview of their differences, and how they fit together.
> +
> +
> +Writing and Running Tests
> +=
> +
> +The bulk of kernel tests are written using either the kselftest or KUnit
> +frameworks. These both provide infrastructure to help make running tests and
> +groups of tests easier, as well as providing helpers to aid in writing new
> +tests.
> +
> +If you're looking to verify the behaviour of the Kernel — particularly 
> specific
> +parts of t

Re: [PATCH v4 2/3] mm/slub, kunit: add a KUnit test for SLUB debugging functionality

2021-04-13 Thread Marco Elver
On Tue, 13 Apr 2021 at 12:07,  wrote:
> From: Oliver Glitta 
>
> SLUB has resiliency_test() function which is hidden behind #ifdef
> SLUB_RESILIENCY_TEST that is not part of Kconfig, so nobody
> runs it. KUnit should be a proper replacement for it.
>
> Try changing byte in redzone after allocation and changing
> pointer to next free node, first byte, 50th byte and redzone
> byte. Check if validation finds errors.
>
> There are several differences from the original resiliency test:
> Tests create own caches with known state instead of corrupting
> shared kmalloc caches.
>
> The corruption of freepointer uses correct offset, the original
> resiliency test got broken with freepointer changes.
>
> Scratch changing random byte test, because it does not have
> meaning in this form where we need deterministic results.
>
> Add new option CONFIG_SLUB_KUNIT_TEST in Kconfig.
> Because the test deliberately modifies non-allocated objects, it depends on
> !KASAN which would have otherwise prevented that.

Hmm, did the test fail with KASAN? Is it possible to skip the tests
and still run a subset of tests with KASAN? It'd be nice if we could
run some of these tests with KASAN as well.

> Use kunit_resource to count errors in cache and silence bug reports.
> Count error whenever slab_bug() or slab_fix() is called or when
> the count of pages is wrong.
>
> Signed-off-by: Oliver Glitta 

Reviewed-by: Marco Elver 

Thanks, this all looks good to me. But perhaps do test what works with
KASAN, to see if you need the !KASAN constraint for all cases.

> ---
> Changes since v3
>
> Use kunit_resource to silence bug reports and count errors suggested by
> Marco Elver.
> Make the test depends on !KASAN thanks to report from the kernel test robot.
>
> Changes since v2
>
> Use bit operation & instead of logical && as reported by kernel test
> robot and Dan Carpenter
>
> Changes since v1
>
> Conversion from kselftest to KUnit test suggested by Marco Elver.
> Error silencing.
> Error counting improvements.
>  lib/Kconfig.debug |  12 
>  lib/Makefile  |   1 +
>  lib/slub_kunit.c  | 150 ++
>  mm/slab.h |   1 +
>  mm/slub.c |  50 ++--
>  5 files changed, 209 insertions(+), 5 deletions(-)
>  create mode 100644 lib/slub_kunit.c
>
> diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> index 2779c29d9981..9b8a0d754278 100644
> --- a/lib/Kconfig.debug
> +++ b/lib/Kconfig.debug
> @@ -2371,6 +2371,18 @@ config BITS_TEST
>
>   If unsure, say N.
>
> +config SLUB_KUNIT_TEST
> +   tristate "KUnit test for SLUB cache error detection" if 
> !KUNIT_ALL_TESTS
> +   depends on SLUB_DEBUG && KUNIT && !KASAN
> +   default KUNIT_ALL_TESTS
> +   help
> + This builds SLUB allocator unit test.
> + Tests SLUB cache debugging functionality.
> + For more information on KUnit and unit tests in general please refer
> + to the KUnit documentation in Documentation/dev-tools/kunit/.
> +
> + If unsure, say N.
> +
>  config TEST_UDELAY
> tristate "udelay test driver"
> help
> diff --git a/lib/Makefile b/lib/Makefile
> index b5307d3eec1a..1e59c6714ed8 100644
> --- a/lib/Makefile
> +++ b/lib/Makefile
> @@ -352,5 +352,6 @@ obj-$(CONFIG_LIST_KUNIT_TEST) += list-test.o
>  obj-$(CONFIG_LINEAR_RANGES_TEST) += test_linear_ranges.o
>  obj-$(CONFIG_BITS_TEST) += test_bits.o
>  obj-$(CONFIG_CMDLINE_KUNIT_TEST) += cmdline_kunit.o
> +obj-$(CONFIG_SLUB_KUNIT_TEST) += slub_kunit.o
>
>  obj-$(CONFIG_GENERIC_LIB_DEVMEM_IS_ALLOWED) += devmem_is_allowed.o
> diff --git a/lib/slub_kunit.c b/lib/slub_kunit.c
> new file mode 100644
> index ..cb9ae9f7e8a6
> --- /dev/null
> +++ b/lib/slub_kunit.c
> @@ -0,0 +1,150 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include "../mm/slab.h"
> +
> +static struct kunit_resource resource;
> +static int slab_errors;
> +
> +static void test_clobber_zone(struct kunit *test)
> +{
> +   struct kmem_cache *s = kmem_cache_create("TestSlub_RZ_alloc", 64, 0,
> +   SLAB_RED_ZONE, NULL);
> +   u8 *p = kmem_cache_alloc(s, GFP_KERNEL);
> +
> +   p[64] = 0x12;
> +
> +   validate_slab_cache(s);
> +   KUNIT_EXPECT_EQ(test, 2, slab_errors);
> +
> +   kmem_cache_free(s, p);
> +   kmem_cache_destroy(s);
> +}
> +
> +static void test_next_pointer(struct kunit *test)
> +{
> +   struct kmem_cache *s = kmem_cache_create("TestSlub_next_ptr_free", 
&

Re: KCSAN: data-race in __jbd2_journal_file_buffer / jbd2_journal_dirty_metadata

2021-04-12 Thread Marco Elver
On Tue, Apr 06, 2021 at 11:01AM -0400, Theodore Ts'o wrote:
> On Tue, Apr 06, 2021 at 02:32:33PM +0200, Jan Kara wrote:
> > And the comment explains, why we do this unreliable check. Again, if we
> > wanted to silence KCSAN, we could use data_race() macro but AFAIU Ted isn't
> > very fond of that annotation.
> 
> I'm not fond of the data_race macro, but I like bogus KCSAN reports
> even less.  My main complaint is if we're going to have to put the
> data_race() macro in place, we're going to need to annotate each
> location with an explanation of why it's there (suppress a KCSAN false
> positive), and why's it's safe.  If it's only one or two places, it'll
> probably be fine.  If it's dozens, then I would say that KCSAN is
> becoming a net negative in terms of making the Linux kernel code
> maintainable.

I've just seen the latest reports on these data races [1], but it seems
the more relevant context is here.
[1] https://lore.kernel.org/linux-ext4/20210412113158.ga4...@quack2.suse.cz/

Let me try to put things in perspective.

No, we do not want maintainability to suffer. Whether or not documenting
the concurrency design via data_race() and a few comments is a negative
or positive is up to you. To me, it'd be a positive because I don't have
to guess what the code is trying to do because concurrent code rarely is
obvious. (In fairness, if you don't like to add comments, just a
data_race() without comment tells a reader more than now; perhaps they'd
then rummage in the git logs.)

Yes, there are currently lots of data-racy accesses in the kernel that
are mostly benign. Yet, they are data races in the memory model's eyes,
and every optimizing compiler is free to screw them up! For example a
lot of those plain read-modify-write bitops ("...  |= ...").

Unfortunately tooling cannot determine without hints (like data_race())
whether or not those are safe, since the programmer's intent is unclear.
Crucially, the programmer's intent is also unclear to the compiler!
Which means the compiler _is_ free to screw up those operations.

If we could somehow precisely determine which plain accesses can race,
we'd solve a decades-old problem: optimizing compilers and concurrent
code do not get along. Therefore, C needed a memory model to sort out
this mess, which we have since C11. The Linux kernel, however, doesn't
play by those rules. The Linux Kernel Memory Model (LKMM) tries to
specify the rules the kernel can safely play by.

But since we have KCSAN, which initially tried to follow the LKMM
strictly, various feedback has resulted in taming KCSAN to a subset of
the LKMM. A lot of the data races that are left, yet appear benign,
simply have no obvious rules or patterns (otherwise we wouldn't have the
problem we have with optimizing compilers). I couldn't, in good
conscience, tame KCSAN based on poorly thought-out rules. Because we
know they're data races, and the compiler _is_ free to subject them to
concurrency-unsafe optimizations.

Because we knew that different codes will want different KCSAN exposure
until there is a de-facto LKMM that is to be followed everywhere (one
can dream), KCSAN has lots of knobs. They are described in detail here:
https://lwn.net/Articles/816854/

> I'm not fond of the data_race macro, but I like bogus KCSAN reports
> even less.

While the data_race() macro was meant to be exactly for this case, to
tell tooling "this data race is fine, even if the compiler messes it
up", if there are too many data races for you right now feel free to add
'KCSAN_SANITIZE_file.o := n' to the files you don't want checked. Or
even 'KCSAN_SANITIZE := n' to ignore all files in a directory. It would
avoid the robots sending you reports. Not ideal, but it'd give some time
to see how things evolve elsewhere if you'd rather avoid all this for
now.

Thanks,
-- Marco


Re: [PATCH] Documentation: dev-tools: Add Testing Overview

2021-04-12 Thread Marco Elver
On Sat, 10 Apr 2021 at 13:53, Daniel Latypov  wrote:
> On Sat, Apr 10, 2021 at 12:05 AM David Gow  wrote:
[...]
> > +
> > +
> > +Sanitizers
> > +==
> > +

The "sanitizers" have originally been a group of tools that relied on
compiler instrumentation to perform various dynamic analysis
(initially ASan, TSan, MSan for user space). The term "sanitizer" has
since been broadened to include a few non-compiler based tools such as
GWP-ASan in user space, of which KFENCE is its kernel cousin but it
doesn't have "sanitizer" in its name (because we felt GWP-KASAN was
pushing it with the acronyms ;-)). Also, these days we have HW_TAGS
based KASAN, which doesn't rely on compiler instrumentation but
instead on MTE in Arm64.

Things like kmemleak have never really been called a sanitizer, but
they _are_ dynamic analysis tools.

So to avoid confusion, in particular avoid establishing "sanitizers"
to be synonymous with "dynamic analysis" ("all sanitizers are dynamic
analysis tools, but not all dynamic analysis tools are sanitizers"),
the section here should not be called "Sanitizers" but "Dynamic
Analysis Tools". We could have a subsection "Sanitizers", but I think
it's not necessary.

> > +The kernel also supports a number of sanitizers, which attempt to detect
> > +classes of issues when the occur in a running kernel. These typically
>
> *they occur
>
> > +look for undefined behaviour of some kind, such as invalid memory accesses,
> > +concurrency issues such as data races, or other undefined behaviour like
> > +integer overflows.
> > +
> > +* :doc:`kmemleak` (Kmemleak) detects possible memory leaks.
> > +* :doc:`kasan` detects invalid memory accesses such as out-of-bounds and
> > +  use-after-free errors.
> > +* :doc:`ubsan` detects behaviour that is undefined by the C standard, like
> > +  integer overflows.
> > +* :doc:`kcsan` detects data races.
> > +* :doc:`kfence` is a low-overhead detector of memory issues, which is much
> > +  faster than KASAN and can be used in production.
>
> Hmm, it lives elsewhere, but would also calling out lockdep here be useful?
> I've also not heard anyone call it a sanitizer before, but it fits the
> definition you've given.
>
> Now that I think about it, I've never looked for documentation on it,
> is this the best page?
> https://www.kernel.org/doc/html/latest/locking/lockdep-design.html

Not a "sanitizer" but our sanitizers are all dynamic analysis tools,
and lockdep is also a dynamic analysis tool.

If we want to be pedantic, the kernel has numerous options to add
"instrumentation" (compiler based or explicit) that will detect some
kind of error at runtime. Most of them live in lib/Kconfig.debug. I
think mentioning something like that is in scope of this document, but
we certainly can't mention all debug tools the kernel has to offer.
Mentioning the big ones like above and then referring to
lib/Kconfig.debug is probably fine.

Dmitry recently gave an excellent talk on some of this:
https://www.youtube.com/watch?v=ufcyOkgFZ2Q

Thanks,
-- Marco


[tip: locking/core] kcsan, debugfs: Move debugfs file creation out of early init

2021-04-11 Thread tip-bot2 for Marco Elver
The following commit has been merged into the locking/core branch of tip:

Commit-ID: e36299efe7d749976fbdaaf756dee6ef32543c2c
Gitweb:
https://git.kernel.org/tip/e36299efe7d749976fbdaaf756dee6ef32543c2c
Author:Marco Elver 
AuthorDate:Wed, 03 Mar 2021 10:38:45 +01:00
Committer: Paul E. McKenney 
CommitterDate: Mon, 08 Mar 2021 14:27:43 -08:00

kcsan, debugfs: Move debugfs file creation out of early init

Commit 56348560d495 ("debugfs: do not attempt to create a new file
before the filesystem is initalized") forbids creating new debugfs files
until debugfs is fully initialized.  This means that KCSAN's debugfs
file creation, which happened at the end of __init(), no longer works.
And was apparently never supposed to work!

However, there is no reason to create KCSAN's debugfs file so early.
This commit therefore moves its creation to a late_initcall() callback.

Cc: "Rafael J. Wysocki" 
Cc: stable 
Fixes: 56348560d495 ("debugfs: do not attempt to create a new file before the 
filesystem is initalized")
Reviewed-by: Greg Kroah-Hartman 
Signed-off-by: Marco Elver 
Signed-off-by: Paul E. McKenney 
---
 kernel/kcsan/core.c| 2 --
 kernel/kcsan/debugfs.c | 4 +++-
 kernel/kcsan/kcsan.h   | 5 -
 3 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c
index 3bf98db..23e7acb 100644
--- a/kernel/kcsan/core.c
+++ b/kernel/kcsan/core.c
@@ -639,8 +639,6 @@ void __init kcsan_init(void)
 
BUG_ON(!in_task());
 
-   kcsan_debugfs_init();
-
for_each_possible_cpu(cpu)
per_cpu(kcsan_rand_state, cpu) = (u32)get_cycles();
 
diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c
index 3c8093a..209ad8d 100644
--- a/kernel/kcsan/debugfs.c
+++ b/kernel/kcsan/debugfs.c
@@ -261,7 +261,9 @@ static const struct file_operations debugfs_ops =
.release = single_release
 };
 
-void __init kcsan_debugfs_init(void)
+static void __init kcsan_debugfs_init(void)
 {
	debugfs_create_file("kcsan", 0644, NULL, NULL, &debugfs_ops);
 }
+
+late_initcall(kcsan_debugfs_init);
diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h
index 8d4bf34..87ccdb3 100644
--- a/kernel/kcsan/kcsan.h
+++ b/kernel/kcsan/kcsan.h
@@ -31,11 +31,6 @@ void kcsan_save_irqtrace(struct task_struct *task);
 void kcsan_restore_irqtrace(struct task_struct *task);
 
 /*
- * Initialize debugfs file.
- */
-void kcsan_debugfs_init(void);
-
-/*
  * Statistics counters displayed via debugfs; should only be modified in
  * slow-paths.
  */


[tip: locking/core] kcsan: Make test follow KUnit style recommendations

2021-04-11 Thread tip-bot2 for Marco Elver
The following commit has been merged into the locking/core branch of tip:

Commit-ID: a146fed56f8a06a6f17ac11ebdc7ca3f396bcb55
Gitweb:
https://git.kernel.org/tip/a146fed56f8a06a6f17ac11ebdc7ca3f396bcb55
Author:Marco Elver 
AuthorDate:Wed, 13 Jan 2021 17:05:56 +01:00
Committer: Paul E. McKenney 
CommitterDate: Mon, 08 Mar 2021 14:27:43 -08:00

kcsan: Make test follow KUnit style recommendations

Per recently added KUnit style recommendations at
Documentation/dev-tools/kunit/style.rst, make the following changes to
the KCSAN test:

1. Rename 'kcsan-test.c' to 'kcsan_test.c'.

2. Rename suite name 'kcsan-test' to 'kcsan'.

3. Rename CONFIG_KCSAN_TEST to CONFIG_KCSAN_KUNIT_TEST and
   default to KUNIT_ALL_TESTS.

Reviewed-by: David Gow 
Signed-off-by: Marco Elver 
Signed-off-by: Paul E. McKenney 
---
 kernel/kcsan/Makefile |4 +-
 kernel/kcsan/kcsan-test.c | 1207 +
 kernel/kcsan/kcsan_test.c | 1207 -
 lib/Kconfig.kcsan |5 +-
 4 files changed, 1212 insertions(+), 1211 deletions(-)
 delete mode 100644 kernel/kcsan/kcsan-test.c
 create mode 100644 kernel/kcsan/kcsan_test.c

diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile
index 65ca553..c2bb07f 100644
--- a/kernel/kcsan/Makefile
+++ b/kernel/kcsan/Makefile
@@ -13,5 +13,5 @@ CFLAGS_core.o := $(call cc-option,-fno-conserve-stack) \
 obj-y := core.o debugfs.o report.o
 obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o
 
-CFLAGS_kcsan-test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer
-obj-$(CONFIG_KCSAN_TEST) += kcsan-test.o
+CFLAGS_kcsan_test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer
+obj-$(CONFIG_KCSAN_KUNIT_TEST) += kcsan_test.o
diff --git a/kernel/kcsan/kcsan-test.c b/kernel/kcsan/kcsan-test.c
deleted file mode 100644
index ebe7fd2..000
--- a/kernel/kcsan/kcsan-test.c
+++ /dev/null
@@ -1,1207 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * KCSAN test with various race scenarious to test runtime behaviour. Since the
- * interface with which KCSAN's reports are obtained is via the console, this 
is
- * the output we should verify. For each test case checks the presence (or
- * absence) of generated reports. Relies on 'console' tracepoint to capture
- * reports as they appear in the kernel log.
- *
- * Makes use of KUnit for test organization, and the Torture framework for test
- * thread control.
- *
- * Copyright (C) 2020, Google LLC.
- * Author: Marco Elver 
- */
-
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-
-#ifdef CONFIG_CC_HAS_TSAN_COMPOUND_READ_BEFORE_WRITE
-#define __KCSAN_ACCESS_RW(alt) (KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE)
-#else
-#define __KCSAN_ACCESS_RW(alt) (alt)
-#endif
-
-/* Points to current test-case memory access "kernels". */
-static void (*access_kernels[2])(void);
-
-static struct task_struct **threads; /* Lists of threads. */
-static unsigned long end_time;   /* End time of test. */
-
-/* Report as observed from console. */
-static struct {
-   spinlock_t lock;
-   int nlines;
-   char lines[3][512];
-} observed = {
-   .lock = __SPIN_LOCK_UNLOCKED(observed.lock),
-};
-
-/* Setup test checking loop. */
-static __no_kcsan inline void
-begin_test_checks(void (*func1)(void), void (*func2)(void))
-{
-   kcsan_disable_current();
-
-   /*
-* Require at least as long as KCSAN_REPORT_ONCE_IN_MS, to ensure at
-* least one race is reported.
-*/
-   end_time = jiffies + msecs_to_jiffies(CONFIG_KCSAN_REPORT_ONCE_IN_MS + 
500);
-
-   /* Signal start; release potential initialization of shared data. */
-   smp_store_release(&access_kernels[0], func1);
-   smp_store_release(&access_kernels[1], func2);
-}
-
-/* End test checking loop. */
-static __no_kcsan inline bool
-end_test_checks(bool stop)
-{
-   if (!stop && time_before(jiffies, end_time)) {
-   /* Continue checking */
-   might_sleep();
-   return false;
-   }
-
-   kcsan_enable_current();
-   return true;
-}
-
-/*
- * Probe for console output: checks if a race was reported, and obtains 
observed
- * lines of interest.
- */
-__no_kcsan
-static void probe_console(void *ignore, const char *buf, size_t len)
-{
-   unsigned long flags;
-   int nlines;
-
-   /*
-* Note that KCSAN reports under a global lock, so we do not risk the
-* possibility of having multiple reports interleaved. If that were the
-* case, we'd expect tests to fail.
-*/
-
-   spin_lock_irqsave(&observed.lock, flags);
-   nlines = observed.nlines;
-
-   if (strnstr(buf, "BUG: KCSAN: ", len) && strnstr(buf, "test_", len)) {
-   /*
-* KCSAN report and related to the test.
-*
-* The provided @buf is not NUL-termina

[tip: locking/core] kcsan: Switch to KUNIT_CASE_PARAM for parameterized tests

2021-04-11 Thread tip-bot2 for Marco Elver
The following commit has been merged into the locking/core branch of tip:

Commit-ID: f6a149140321274cbd955dee50798fe191841f94
Gitweb:
https://git.kernel.org/tip/f6a149140321274cbd955dee50798fe191841f94
Author:Marco Elver 
AuthorDate:Wed, 13 Jan 2021 17:05:57 +01:00
Committer: Paul E. McKenney 
CommitterDate: Mon, 08 Mar 2021 14:27:43 -08:00

kcsan: Switch to KUNIT_CASE_PARAM for parameterized tests

Since KUnit now supports parameterized tests via KUNIT_CASE_PARAM, update
KCSAN's test to switch to it for parameterized tests. This simplifies
parameterized tests and gets rid of the "parameters in case name"
workaround (hack).

At the same time, we can increase the maximum number of threads used,
because on systems with too few CPUs, KUnit allows us to now stop at the
maximum useful threads and not unnecessarily execute redundant test
cases with (the same) limited threads as had been the case before.

Reviewed-by: David Gow 
Signed-off-by: Marco Elver 
Signed-off-by: Paul E. McKenney 
---
 kernel/kcsan/kcsan_test.c | 116 +
 1 file changed, 54 insertions(+), 62 deletions(-)

diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c
index f16f632..b71751f 100644
--- a/kernel/kcsan/kcsan_test.c
+++ b/kernel/kcsan/kcsan_test.c
@@ -13,6 +13,8 @@
  * Author: Marco Elver 
  */
 
+#define pr_fmt(fmt) "kcsan_test: " fmt
+
 #include 
 #include 
 #include 
@@ -951,22 +953,53 @@ static void test_atomic_builtins(struct kunit *test)
 }
 
 /*
- * Each test case is run with different numbers of threads. Until KUnit 
supports
- * passing arguments for each test case, we encode #threads in the test case
- * name (read by get_num_threads()). [The '-' was chosen as a stylistic
- * preference to separate test name and #threads.]
+ * Generate thread counts for all test cases. Values generated are in interval
+ * [2, 5] followed by exponentially increasing thread counts from 8 to 32.
  *
  * The thread counts are chosen to cover potentially interesting boundaries and
- * corner cases (range 2-5), and then stress the system with larger counts.
+ * corner cases (2 to 5), and then stress the system with larger counts.
  */
-#define KCSAN_KUNIT_CASE(test_name)
\
-   { .run_case = test_name, .name = #test_name "-02" },   \
-   { .run_case = test_name, .name = #test_name "-03" },   \
-   { .run_case = test_name, .name = #test_name "-04" },   \
-   { .run_case = test_name, .name = #test_name "-05" },   \
-   { .run_case = test_name, .name = #test_name "-08" },   \
-   { .run_case = test_name, .name = #test_name "-16" }
+static const void *nthreads_gen_params(const void *prev, char *desc)
+{
+   long nthreads = (long)prev;
+
+   if (nthreads < 0 || nthreads >= 32)
+   nthreads = 0; /* stop */
+   else if (!nthreads)
+   nthreads = 2; /* initial value */
+   else if (nthreads < 5)
+   nthreads++;
+   else if (nthreads == 5)
+   nthreads = 8;
+   else
+   nthreads *= 2;
 
+   if (!IS_ENABLED(CONFIG_PREEMPT) || 
!IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) {
+   /*
+* Without any preemption, keep 2 CPUs free for other tasks, one
+* of which is the main test case function checking for
+* completion or failure.
+*/
+   const long min_unused_cpus = IS_ENABLED(CONFIG_PREEMPT_NONE) ? 
2 : 0;
+   const long min_required_cpus = 2 + min_unused_cpus;
+
+   if (num_online_cpus() < min_required_cpus) {
+   pr_err_once("Too few online CPUs (%u < %d) for test\n",
+   num_online_cpus(), min_required_cpus);
+   nthreads = 0;
+   } else if (nthreads >= num_online_cpus() - min_unused_cpus) {
+   /* Use negative value to indicate last param. */
+   nthreads = -(num_online_cpus() - min_unused_cpus);
+   pr_warn_once("Limiting number of threads to %ld (only 
%d online CPUs)\n",
+-nthreads, num_online_cpus());
+   }
+   }
+
+   snprintf(desc, KUNIT_PARAM_DESC_SIZE, "threads=%ld", abs(nthreads));
+   return (void *)nthreads;
+}
+
+#define KCSAN_KUNIT_CASE(test_name) KUNIT_CASE_PARAM(test_name, 
nthreads_gen_params)
 static struct kunit_case kcsan_test_cases[] = {
KCSAN_KUNIT_CASE(test_basic),
KCSAN_KUNIT_CASE(test_concurrent_races),
@@ -996,24 +1029,6 @@ static struct kunit_case kcsan_test_cases[] = {
 
 /* = End test cases = */
 
-/* Get number of threads encoded in test name. */
-static bo

[tip: locking/core] kcsan: Add missing license and copyright headers

2021-04-11 Thread tip-bot2 for Marco Elver
The following commit has been merged into the locking/core branch of tip:

Commit-ID: bd0ccc4afca2d6ae0029cae35c4f1d2e2ade7579
Gitweb:
https://git.kernel.org/tip/bd0ccc4afca2d6ae0029cae35c4f1d2e2ade7579
Author:Marco Elver 
AuthorDate:Fri, 15 Jan 2021 18:09:53 +01:00
Committer: Paul E. McKenney 
CommitterDate: Mon, 08 Mar 2021 14:27:43 -08:00

kcsan: Add missing license and copyright headers

Adds missing license and/or copyright headers for KCSAN source files.

Signed-off-by: Marco Elver 
Signed-off-by: Paul E. McKenney 
---
 Documentation/dev-tools/kcsan.rst | 3 +++
 include/linux/kcsan-checks.h  | 6 ++
 include/linux/kcsan.h | 7 +++
 kernel/kcsan/atomic.h | 5 +
 kernel/kcsan/core.c   | 5 +
 kernel/kcsan/debugfs.c| 5 +
 kernel/kcsan/encoding.h   | 5 +
 kernel/kcsan/kcsan.h  | 3 ++-
 kernel/kcsan/report.c | 5 +
 kernel/kcsan/selftest.c   | 5 +
 10 files changed, 48 insertions(+), 1 deletion(-)

diff --git a/Documentation/dev-tools/kcsan.rst 
b/Documentation/dev-tools/kcsan.rst
index be7a0b0..d85ce23 100644
--- a/Documentation/dev-tools/kcsan.rst
+++ b/Documentation/dev-tools/kcsan.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. Copyright (C) 2019, Google LLC.
+
 The Kernel Concurrency Sanitizer (KCSAN)
 
 
diff --git a/include/linux/kcsan-checks.h b/include/linux/kcsan-checks.h
index cf14840..9fd0ad8 100644
--- a/include/linux/kcsan-checks.h
+++ b/include/linux/kcsan-checks.h
@@ -1,4 +1,10 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * KCSAN access checks and modifiers. These can be used to explicitly check
+ * uninstrumented accesses, or change KCSAN checking behaviour of accesses.
+ *
+ * Copyright (C) 2019, Google LLC.
+ */
 
 #ifndef _LINUX_KCSAN_CHECKS_H
 #define _LINUX_KCSAN_CHECKS_H
diff --git a/include/linux/kcsan.h b/include/linux/kcsan.h
index 53340d8..fc266ec 100644
--- a/include/linux/kcsan.h
+++ b/include/linux/kcsan.h
@@ -1,4 +1,11 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * The Kernel Concurrency Sanitizer (KCSAN) infrastructure. Public interface 
and
+ * data structures to set up runtime. See kcsan-checks.h for explicit checks 
and
+ * modifiers. For more info please see Documentation/dev-tools/kcsan.rst.
+ *
+ * Copyright (C) 2019, Google LLC.
+ */
 
 #ifndef _LINUX_KCSAN_H
 #define _LINUX_KCSAN_H
diff --git a/kernel/kcsan/atomic.h b/kernel/kcsan/atomic.h
index 75fe701..530ae1b 100644
--- a/kernel/kcsan/atomic.h
+++ b/kernel/kcsan/atomic.h
@@ -1,4 +1,9 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Rules for implicitly atomic memory accesses.
+ *
+ * Copyright (C) 2019, Google LLC.
+ */
 
 #ifndef _KERNEL_KCSAN_ATOMIC_H
 #define _KERNEL_KCSAN_ATOMIC_H
diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c
index 23e7acb..45c821d 100644
--- a/kernel/kcsan/core.c
+++ b/kernel/kcsan/core.c
@@ -1,4 +1,9 @@
 // SPDX-License-Identifier: GPL-2.0
+/*
+ * KCSAN core runtime.
+ *
+ * Copyright (C) 2019, Google LLC.
+ */
 
 #define pr_fmt(fmt) "kcsan: " fmt
 
diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c
index 209ad8d..c1dd02f 100644
--- a/kernel/kcsan/debugfs.c
+++ b/kernel/kcsan/debugfs.c
@@ -1,4 +1,9 @@
 // SPDX-License-Identifier: GPL-2.0
+/*
+ * KCSAN debugfs interface.
+ *
+ * Copyright (C) 2019, Google LLC.
+ */
 
 #define pr_fmt(fmt) "kcsan: " fmt
 
diff --git a/kernel/kcsan/encoding.h b/kernel/kcsan/encoding.h
index 7ee4055..170a2bb 100644
--- a/kernel/kcsan/encoding.h
+++ b/kernel/kcsan/encoding.h
@@ -1,4 +1,9 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * KCSAN watchpoint encoding.
+ *
+ * Copyright (C) 2019, Google LLC.
+ */
 
 #ifndef _KERNEL_KCSAN_ENCODING_H
 #define _KERNEL_KCSAN_ENCODING_H
diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h
index 87ccdb3..9881099 100644
--- a/kernel/kcsan/kcsan.h
+++ b/kernel/kcsan/kcsan.h
@@ -1,8 +1,9 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-
 /*
  * The Kernel Concurrency Sanitizer (KCSAN) infrastructure. For more info 
please
  * see Documentation/dev-tools/kcsan.rst.
+ *
+ * Copyright (C) 2019, Google LLC.
  */
 
 #ifndef _KERNEL_KCSAN_KCSAN_H
diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c
index d3bf87e..13dce3c 100644
--- a/kernel/kcsan/report.c
+++ b/kernel/kcsan/report.c
@@ -1,4 +1,9 @@
 // SPDX-License-Identifier: GPL-2.0
+/*
+ * KCSAN reporting.
+ *
+ * Copyright (C) 2019, Google LLC.
+ */
 
 #include 
 #include 
diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c
index 9014a3a..7f29cb0 100644
--- a/kernel/kcsan/selftest.c
+++ b/kernel/kcsan/selftest.c
@@ -1,4 +1,9 @@
 // SPDX-License-Identifier: GPL-2.0
+/*
+ * KCSAN short boot-time selftests.
+ *
+ * Copyright (C) 2019, Google LLC.
+ */
 
 #define pr_fmt(fmt) "kcsan: " fmt
 


[PATCH v4 10/10] perf test: Add basic stress test for sigtrap handling

2021-04-08 Thread Marco Elver
Add basic stress test for sigtrap handling as a perf tool built-in test.
This allows sanity checking the basic sigtrap functionality from within
the perf tool.

Note: A more elaborate kselftest version of this test can also be found
in tools/testing/selftests/perf_events/sigtrap_threads.c.

Signed-off-by: Marco Elver 
---
v4:
* Update for new perf_event_attr::sig_data / si_perf handling.

v3:
* Added to series (per suggestion from Ian Rogers).
---
 tools/perf/tests/Build  |   1 +
 tools/perf/tests/builtin-test.c |   5 ++
 tools/perf/tests/sigtrap.c  | 150 
 tools/perf/tests/tests.h|   1 +
 4 files changed, 157 insertions(+)
 create mode 100644 tools/perf/tests/sigtrap.c

diff --git a/tools/perf/tests/Build b/tools/perf/tests/Build
index 650aec19d490..a429c7a02b37 100644
--- a/tools/perf/tests/Build
+++ b/tools/perf/tests/Build
@@ -64,6 +64,7 @@ perf-y += parse-metric.o
 perf-y += pe-file-parsing.o
 perf-y += expand-cgroup.o
 perf-y += perf-time-to-tsc.o
+perf-y += sigtrap.o
 
 $(OUTPUT)tests/llvm-src-base.c: tests/bpf-script-example.c tests/Build
$(call rule_mkdir)
diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c
index c4b888f18e9c..28a1cb5eaa77 100644
--- a/tools/perf/tests/builtin-test.c
+++ b/tools/perf/tests/builtin-test.c
@@ -359,6 +359,11 @@ static struct test generic_tests[] = {
.func = test__perf_time_to_tsc,
.is_supported = test__tsc_is_supported,
},
+   {
+   .desc = "Sigtrap support",
+   .func = test__sigtrap,
+   .is_supported = test__wp_is_supported, /* uses wp for test */
+   },
{
.func = NULL,
},
diff --git a/tools/perf/tests/sigtrap.c b/tools/perf/tests/sigtrap.c
new file mode 100644
index ..c367cc2f64d5
--- /dev/null
+++ b/tools/perf/tests/sigtrap.c
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Basic test for sigtrap support.
+ *
+ * Copyright (C) 2021, Google LLC.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "cloexec.h"
+#include "debug.h"
+#include "event.h"
+#include "tests.h"
+#include "../perf-sys.h"
+
+#define NUM_THREADS 5
+
+static struct {
+   int tids_want_signal;   /* Which threads still want a signal. */
+   int signal_count;   /* Sanity check number of signals 
received. */
+   volatile int iterate_on;/* Variable to set breakpoint on. */
+   siginfo_t first_siginfo;/* First observed siginfo_t. */
+} ctx;
+
+#define TEST_SIG_DATA (~(uint64_t)(&ctx.iterate_on))
+
+static struct perf_event_attr make_event_attr(void)
+{
+   struct perf_event_attr attr = {
+   .type   = PERF_TYPE_BREAKPOINT,
+   .size   = sizeof(attr),
+   .sample_period  = 1,
+   .disabled   = 1,
+   .bp_addr= (unsigned long)&ctx.iterate_on,
+   .bp_type= HW_BREAKPOINT_RW,
+   .bp_len = HW_BREAKPOINT_LEN_1,
+   .inherit= 1, /* Children inherit events ... */
+   .inherit_thread = 1, /* ... but only cloned with CLONE_THREAD. 
*/
+   .remove_on_exec = 1, /* Required by sigtrap. */
+   .sigtrap= 1, /* Request synchronous SIGTRAP on event. */
+   .sig_data   = TEST_SIG_DATA,
+   };
+   return attr;
+}
+
+static void
+sigtrap_handler(int signum __maybe_unused, siginfo_t *info, void *ucontext 
__maybe_unused)
+{
+   if (!__atomic_fetch_add(&ctx.signal_count, 1, __ATOMIC_RELAXED))
+   ctx.first_siginfo = *info;
+   __atomic_fetch_sub(&ctx.tids_want_signal, syscall(SYS_gettid), 
__ATOMIC_RELAXED);
+}
+
+static void *test_thread(void *arg)
+{
+   pthread_barrier_t *barrier = (pthread_barrier_t *)arg;
+   pid_t tid = syscall(SYS_gettid);
+   int i;
+
+   pthread_barrier_wait(barrier);
+
+   __atomic_fetch_add(&ctx.tids_want_signal, tid, __ATOMIC_RELAXED);
+   for (i = 0; i < ctx.iterate_on - 1; i++)
+   __atomic_fetch_add(&ctx.tids_want_signal, tid, 
__ATOMIC_RELAXED);
+
+   return NULL;
+}
+
+static int run_test_threads(pthread_t *threads, pthread_barrier_t *barrier)
+{
+   int i;
+
+   pthread_barrier_wait(barrier);
+   for (i = 0; i < NUM_THREADS; i++)
+   TEST_ASSERT_EQUAL("pthread_join() failed", 
pthread_join(threads[i], NULL), 0);
+
+   return TEST_OK;
+}
+
+static int run_stress_test(int fd, pthread_t *threads, pthread_barrier_t 
*barrier)
+{
+   int ret;
+
+   ctx.iterate_on = 3000;
+
+   TEST_ASSERT_EQUAL("misfired signal?", ctx.signal_count, 0);
+   TEST_ASSERT_EQUAL("enable failed", ioctl(fd, PERF_EVENT_IOC_ENABLE, 0), 
0);
+   ret = run_test_threads(threads, barrier);
+   TEST_ASSERT_EQUAL("disable

[PATCH v4 09/10] tools headers uapi: Sync tools/include/uapi/linux/perf_event.h

2021-04-08 Thread Marco Elver
Sync tool's uapi to pick up the changes adding inherit_thread,
remove_on_exec, and sigtrap fields to perf_event_attr.

Signed-off-by: Marco Elver 
---
v4:
* Update for new perf_event_attr::sig_data.

v3:
* Added to series.
---
 tools/include/uapi/linux/perf_event.h | 12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/tools/include/uapi/linux/perf_event.h 
b/tools/include/uapi/linux/perf_event.h
index ad15e40d7f5d..31b00e3b69c9 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -311,6 +311,7 @@ enum perf_event_read_format {
 #define PERF_ATTR_SIZE_VER4104 /* add: sample_regs_intr */
 #define PERF_ATTR_SIZE_VER5112 /* add: aux_watermark */
 #define PERF_ATTR_SIZE_VER6120 /* add: aux_sample_size */
+#define PERF_ATTR_SIZE_VER7128 /* add: sig_data */
 
 /*
  * Hardware event_id to monitor via a performance monitoring event:
@@ -389,7 +390,10 @@ struct perf_event_attr {
cgroup :  1, /* include cgroup events */
text_poke  :  1, /* include text poke 
events */
build_id   :  1, /* use build id in mmap2 
events */
-   __reserved_1   : 29;
+   inherit_thread :  1, /* children only inherit 
if cloned with CLONE_THREAD */
+   remove_on_exec :  1, /* event is removed from 
task on exec */
+   sigtrap:  1, /* send synchronous 
SIGTRAP on event */
+   __reserved_1   : 26;
 
union {
__u32   wakeup_events;/* wakeup every n events */
@@ -441,6 +445,12 @@ struct perf_event_attr {
__u16   __reserved_2;
__u32   aux_sample_size;
__u32   __reserved_3;
+
+   /*
+* User provided data if sigtrap=1, passed back to user via
+* siginfo_t::si_perf, e.g. to permit user to identify the event.
+*/
+   __u64   sig_data;
 };
 
 /*
-- 
2.31.0.208.g409f899ff0-goog



[PATCH v4 08/10] selftests/perf_events: Add kselftest for remove_on_exec

2021-04-08 Thread Marco Elver
Add kselftest to test that remove_on_exec removes inherited events from
child tasks.

Signed-off-by: Marco Elver 
---
v3:
* Fix for latest libc signal.h.

v2:
* Add patch to series.
---
 .../testing/selftests/perf_events/.gitignore  |   1 +
 tools/testing/selftests/perf_events/Makefile  |   2 +-
 .../selftests/perf_events/remove_on_exec.c| 260 ++
 3 files changed, 262 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/perf_events/remove_on_exec.c

diff --git a/tools/testing/selftests/perf_events/.gitignore 
b/tools/testing/selftests/perf_events/.gitignore
index 4dc43e1bd79c..790c47001e77 100644
--- a/tools/testing/selftests/perf_events/.gitignore
+++ b/tools/testing/selftests/perf_events/.gitignore
@@ -1,2 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0-only
 sigtrap_threads
+remove_on_exec
diff --git a/tools/testing/selftests/perf_events/Makefile 
b/tools/testing/selftests/perf_events/Makefile
index 973a2c39ca83..fcafa5f0d34c 100644
--- a/tools/testing/selftests/perf_events/Makefile
+++ b/tools/testing/selftests/perf_events/Makefile
@@ -2,5 +2,5 @@
 CFLAGS += -Wl,-no-as-needed -Wall -I../../../../usr/include
 LDFLAGS += -lpthread
 
-TEST_GEN_PROGS := sigtrap_threads
+TEST_GEN_PROGS := sigtrap_threads remove_on_exec
 include ../lib.mk
diff --git a/tools/testing/selftests/perf_events/remove_on_exec.c 
b/tools/testing/selftests/perf_events/remove_on_exec.c
new file mode 100644
index ..5814611a1dc7
--- /dev/null
+++ b/tools/testing/selftests/perf_events/remove_on_exec.c
@@ -0,0 +1,260 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test for remove_on_exec.
+ *
+ * Copyright (C) 2021, Google LLC.
+ */
+
+#define _GNU_SOURCE
+
+/* We need the latest siginfo from the kernel repo. */
+#include 
+#include 
+#define __have_siginfo_t 1
+#define __have_sigval_t 1
+#define __have_sigevent_t 1
+#define __siginfo_t_defined
+#define __sigval_t_defined
+#define __sigevent_t_defined
+#define _BITS_SIGINFO_CONSTS_H 1
+#define _BITS_SIGEVENT_CONSTS_H 1
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "../kselftest_harness.h"
+
+static volatile int signal_count;
+
+static struct perf_event_attr make_event_attr(void)
+{
+   struct perf_event_attr attr = {
+   .type   = PERF_TYPE_HARDWARE,
+   .size   = sizeof(attr),
+   .config = PERF_COUNT_HW_INSTRUCTIONS,
+   .sample_period  = 1000,
+   .exclude_kernel = 1,
+   .exclude_hv = 1,
+   .disabled   = 1,
+   .inherit= 1,
+   /*
+* Children normally retain their inherited event on exec; with
+* remove_on_exec, we'll remove their event, but the parent and
+* any other non-exec'd children will keep their events.
+*/
+   .remove_on_exec = 1,
+   .sigtrap= 1,
+   };
+   return attr;
+}
+
+static void sigtrap_handler(int signum, siginfo_t *info, void *ucontext)
+{
+   if (info->si_code != TRAP_PERF) {
+   fprintf(stderr, "%s: unexpected si_code %d\n", __func__, 
info->si_code);
+   return;
+   }
+
+   signal_count++;
+}
+
+FIXTURE(remove_on_exec)
+{
+   struct sigaction oldact;
+   int fd;
+};
+
+FIXTURE_SETUP(remove_on_exec)
+{
+   struct perf_event_attr attr = make_event_attr();
+   struct sigaction action = {};
+
+   signal_count = 0;
+
+   /* Initialize sigtrap handler. */
+   action.sa_flags = SA_SIGINFO | SA_NODEFER;
+   action.sa_sigaction = sigtrap_handler;
+   sigemptyset(&action.sa_mask);
+   ASSERT_EQ(sigaction(SIGTRAP, &action, &self->oldact), 0);
+
+   /* Initialize perf event. */
+   self->fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 
PERF_FLAG_FD_CLOEXEC);
+   ASSERT_NE(self->fd, -1);
+}
+
+FIXTURE_TEARDOWN(remove_on_exec)
+{
+   close(self->fd);
+   sigaction(SIGTRAP, &self->oldact, NULL);
+}
+
+/* Verify event propagates to fork'd child. */
+TEST_F(remove_on_exec, fork_only)
+{
+   int status;
+   pid_t pid = fork();
+
+   if (pid == 0) {
+   ASSERT_EQ(signal_count, 0);
+   ASSERT_EQ(ioctl(self->fd, PERF_EVENT_IOC_ENABLE, 0), 0);
+   while (!signal_count);
+   _exit(42);
+   }
+
+   while (!signal_count); /* Child enables event. */
+   EXPECT_EQ(waitpid(pid, &status, 0), pid);
+   EXPECT_EQ(WEXITSTATUS(status), 42);
+}
+
+/*
+ * Verify that event does _not_ propagate to fork+exec'd child; event enabled
+ * after fork+exec.
+ */
+TEST_F(remove_on_exec, fork_exec_then_enable)
+{
+   pid_t pid_exec, pid_only_fork;
+   int pipefd[2];
+   int tmp;
+
+   /*
+* Non-exec child, to ensure exec does not affect inherited events of
+* other children.
+*/
+   pid_only_f

[PATCH v4 06/10] perf: Add support for SIGTRAP on perf events

2021-04-08 Thread Marco Elver
Adds bit perf_event_attr::sigtrap, which can be set to cause events to
send SIGTRAP (with si_code TRAP_PERF) to the task where the event
occurred. The primary motivation is to support synchronous signals on
perf events in the task where an event (such as breakpoints) triggered.

To distinguish perf events based on the event type, the type is set in
si_errno. For events that are associated with an address, si_addr is
copied from perf_sample_data.

The new field perf_event_attr::sig_data is copied to si_perf, which
allows user space to disambiguate which event (of the same type)
triggered the signal. For example, user space could encode the relevant
information it cares about in sig_data.

We note that the choice of an opaque u64 provides the simplest and most
flexible option. Alternatives where a reference to some user space data
is passed back suffer from the problem that modification of referenced
data (be it the event fd, or the perf_event_attr) can race with the
signal being delivered (of course, the same caveat applies if user space
decides to store a pointer in sig_data, but the ABI explicitly avoids
prescribing such a design).

Link: 
https://lore.kernel.org/lkml/ybv3rat566k+6...@hirez.programming.kicks-ass.net/
Suggested-by: Peter Zijlstra 
Acked-by: Dmitry Vyukov 
Signed-off-by: Marco Elver 
---
v4:
* Generalize setting si_perf and si_addr independent of event type;
  introduces perf_event_attr::sig_data, which can be set by user space to
  be propagated to si_perf.
* Fix race between irq_work running and task's sighand being released by
  release_task().
* Warning in perf_sigtrap() if ctx->task and current mismatch; we expect
  this on architectures that do not properly implement
  arch_irq_work_raise().
* Require events that want sigtrap to be associated with a task.

v2:
* Use atomic_set(_count, 1), since it must always be 0 in
  perf_pending_event_disable().
* Implicitly restrict inheriting events if sigtrap, but the child was
  cloned with CLONE_CLEAR_SIGHAND, because it is not generally safe if
  the child cleared all signal handlers to continue sending SIGTRAP.
---
 include/linux/perf_event.h  |  3 ++
 include/uapi/linux/perf_event.h | 10 ++-
 kernel/events/core.c| 49 -
 3 files changed, 60 insertions(+), 2 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 1660039199b2..18ba1282c5c7 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -778,6 +778,9 @@ struct perf_event {
void *security;
 #endif
struct list_headsb_list;
+
+   /* Address associated with event, which can be passed to siginfo_t. */
+   u64 sig_addr;
 #endif /* CONFIG_PERF_EVENTS */
 };
 
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 8c5b9f5ad63f..31b00e3b69c9 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -311,6 +311,7 @@ enum perf_event_read_format {
 #define PERF_ATTR_SIZE_VER4104 /* add: sample_regs_intr */
 #define PERF_ATTR_SIZE_VER5112 /* add: aux_watermark */
 #define PERF_ATTR_SIZE_VER6120 /* add: aux_sample_size */
+#define PERF_ATTR_SIZE_VER7128 /* add: sig_data */
 
 /*
  * Hardware event_id to monitor via a performance monitoring event:
@@ -391,7 +392,8 @@ struct perf_event_attr {
build_id   :  1, /* use build id in mmap2 
events */
inherit_thread :  1, /* children only inherit 
if cloned with CLONE_THREAD */
remove_on_exec :  1, /* event is removed from 
task on exec */
-   __reserved_1   : 27;
+   sigtrap:  1, /* send synchronous 
SIGTRAP on event */
+   __reserved_1   : 26;
 
union {
__u32   wakeup_events;/* wakeup every n events */
@@ -443,6 +445,12 @@ struct perf_event_attr {
__u16   __reserved_2;
__u32   aux_sample_size;
__u32   __reserved_3;
+
+   /*
+* User provided data if sigtrap=1, passed back to user via
+* siginfo_t::si_perf, e.g. to permit user to identify the event.
+*/
+   __u64   sig_data;
 };
 
 /*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 19c045ff2b9c..1d2077389c0c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6391,6 +6391,33 @@ void perf_event_wakeup(struct perf_event *event)
}
 }
 
+static void perf_sigtrap(struct perf_event *event)
+{
+   struct kernel_siginfo info;
+
+   /*
+* We'd expect this to only occur if the irq_work is delayed and either
+* ctx->task or current has changed in the meantime. This can be the
+* case on architectures that do not implement arch_irq_work_raise().
+*/
+   if (WARN_ON_ONCE(event->ctx->tas

[PATCH v4 07/10] selftests/perf_events: Add kselftest for process-wide sigtrap handling

2021-04-08 Thread Marco Elver
Add a kselftest for testing process-wide perf events with synchronous
SIGTRAP on events (using breakpoints). In particular, we want to test
that changes to the event propagate to all children, and the SIGTRAPs
are in fact synchronously sent to the thread where the event occurred.

Note: The "signal_stress" test case is also added later in the series to
perf tool's built-in tests. The test here is more elaborate in that
respect, which on one hand avoids bloating the perf tool unnecessarily,
but we also benefit from structured tests with TAP-compliant output that
the kselftest framework provides.

Signed-off-by: Marco Elver 
---
v4:
* Update for new perf_event_attr::sig_data / si_perf handling.

v3:
* Fix for latest libc signal.h.

v2:
* Patch added to series.
---
 .../testing/selftests/perf_events/.gitignore  |   2 +
 tools/testing/selftests/perf_events/Makefile  |   6 +
 tools/testing/selftests/perf_events/config|   1 +
 tools/testing/selftests/perf_events/settings  |   1 +
 .../selftests/perf_events/sigtrap_threads.c   | 210 ++
 5 files changed, 220 insertions(+)
 create mode 100644 tools/testing/selftests/perf_events/.gitignore
 create mode 100644 tools/testing/selftests/perf_events/Makefile
 create mode 100644 tools/testing/selftests/perf_events/config
 create mode 100644 tools/testing/selftests/perf_events/settings
 create mode 100644 tools/testing/selftests/perf_events/sigtrap_threads.c

diff --git a/tools/testing/selftests/perf_events/.gitignore 
b/tools/testing/selftests/perf_events/.gitignore
new file mode 100644
index ..4dc43e1bd79c
--- /dev/null
+++ b/tools/testing/selftests/perf_events/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+sigtrap_threads
diff --git a/tools/testing/selftests/perf_events/Makefile 
b/tools/testing/selftests/perf_events/Makefile
new file mode 100644
index ..973a2c39ca83
--- /dev/null
+++ b/tools/testing/selftests/perf_events/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0
+CFLAGS += -Wl,-no-as-needed -Wall -I../../../../usr/include
+LDFLAGS += -lpthread
+
+TEST_GEN_PROGS := sigtrap_threads
+include ../lib.mk
diff --git a/tools/testing/selftests/perf_events/config 
b/tools/testing/selftests/perf_events/config
new file mode 100644
index ..ba58ff2203e4
--- /dev/null
+++ b/tools/testing/selftests/perf_events/config
@@ -0,0 +1 @@
+CONFIG_PERF_EVENTS=y
diff --git a/tools/testing/selftests/perf_events/settings 
b/tools/testing/selftests/perf_events/settings
new file mode 100644
index ..6091b45d226b
--- /dev/null
+++ b/tools/testing/selftests/perf_events/settings
@@ -0,0 +1 @@
+timeout=120
diff --git a/tools/testing/selftests/perf_events/sigtrap_threads.c 
b/tools/testing/selftests/perf_events/sigtrap_threads.c
new file mode 100644
index ..9c0fd442da60
--- /dev/null
+++ b/tools/testing/selftests/perf_events/sigtrap_threads.c
@@ -0,0 +1,210 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test for perf events with SIGTRAP across all threads.
+ *
+ * Copyright (C) 2021, Google LLC.
+ */
+
+#define _GNU_SOURCE
+
+/* We need the latest siginfo from the kernel repo. */
+#include 
+#include 
+#define __have_siginfo_t 1
+#define __have_sigval_t 1
+#define __have_sigevent_t 1
+#define __siginfo_t_defined
+#define __sigval_t_defined
+#define __sigevent_t_defined
+#define _BITS_SIGINFO_CONSTS_H 1
+#define _BITS_SIGEVENT_CONSTS_H 1
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "../kselftest_harness.h"
+
+#define NUM_THREADS 5
+
+/* Data shared between test body, threads, and signal handler. */
+static struct {
+   int tids_want_signal;   /* Which threads still want a signal. */
+   int signal_count;   /* Sanity check number of signals 
received. */
+   volatile int iterate_on;/* Variable to set breakpoint on. */
+   siginfo_t first_siginfo;/* First observed siginfo_t. */
+} ctx;
+
+/* Unique value to check si_perf is correctly set from 
perf_event_attr::sig_data. */
+#define TEST_SIG_DATA(addr) (~(uint64_t)(addr))
+
+static struct perf_event_attr make_event_attr(bool enabled, volatile void 
*addr)
+{
+   struct perf_event_attr attr = {
+   .type   = PERF_TYPE_BREAKPOINT,
+   .size   = sizeof(attr),
+   .sample_period  = 1,
+   .disabled   = !enabled,
+   .bp_addr= (unsigned long)addr,
+   .bp_type= HW_BREAKPOINT_RW,
+   .bp_len = HW_BREAKPOINT_LEN_1,
+   .inherit= 1, /* Children inherit events ... */
+   .inherit_thread = 1, /* ... but only cloned with CLONE_THREAD. 
*/
+   .remove_on_exec = 1, /* Required by sigtrap. */
+   .sigtrap= 1, /* Request synchronous SIGTRAP on event. */
+   .sig_data   

[PATCH v4 04/10] perf: Add support for event removal on exec

2021-04-08 Thread Marco Elver
Adds bit perf_event_attr::remove_on_exec, to support removing an event
from a task on exec.

This option supports the case where an event is supposed to be
process-wide only, and should not propagate beyond exec, to limit
monitoring to the original process image only.

Suggested-by: Peter Zijlstra 
Signed-off-by: Marco Elver 
---
v3:
* Rework based on Peter's "perf: Rework perf_event_exit_event()" added
  to the beginning of the series. Intermediate attempts between v2 and
  this v3 can be found here:
  https://lkml.kernel.org/r/yfm6aaksrlf2n...@elver.google.com

v2:
* Add patch to series.
---
 include/uapi/linux/perf_event.h |  3 +-
 kernel/events/core.c| 70 +
 2 files changed, 64 insertions(+), 9 deletions(-)

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 813efb65fea8..8c5b9f5ad63f 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -390,7 +390,8 @@ struct perf_event_attr {
text_poke  :  1, /* include text poke 
events */
build_id   :  1, /* use build id in mmap2 
events */
inherit_thread :  1, /* children only inherit 
if cloned with CLONE_THREAD */
-   __reserved_1   : 28;
+   remove_on_exec :  1, /* event is removed from 
task on exec */
+   __reserved_1   : 27;
 
union {
__u32   wakeup_events;/* wakeup every n events */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index de2917b3c59e..19c045ff2b9c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4247,6 +4247,57 @@ static void perf_event_enable_on_exec(int ctxn)
put_ctx(clone_ctx);
 }
 
+static void perf_remove_from_owner(struct perf_event *event);
+static void perf_event_exit_event(struct perf_event *event,
+ struct perf_event_context *ctx);
+
+/*
+ * Removes all events from the current task that have been marked
+ * remove-on-exec, and feeds their values back to parent events.
+ */
+static void perf_event_remove_on_exec(int ctxn)
+{
+   struct perf_event_context *ctx, *clone_ctx = NULL;
+   struct perf_event *event, *next;
+   LIST_HEAD(free_list);
+   unsigned long flags;
+   bool modified = false;
+
+   ctx = perf_pin_task_context(current, ctxn);
+   if (!ctx)
+   return;
+
+   mutex_lock(&ctx->mutex);
+
+   if (WARN_ON_ONCE(ctx->task != current))
+   goto unlock;
+
+   list_for_each_entry_safe(event, next, &ctx->event_list, event_entry) {
+   if (!event->attr.remove_on_exec)
+   continue;
+
+   if (!is_kernel_event(event))
+   perf_remove_from_owner(event);
+
+   modified = true;
+
+   perf_event_exit_event(event, ctx);
+   }
+
+   raw_spin_lock_irqsave(&ctx->lock, flags);
+   if (modified)
+   clone_ctx = unclone_ctx(ctx);
+   --ctx->pin_count;
+   raw_spin_unlock_irqrestore(&ctx->lock, flags);
+
+unlock:
+   mutex_unlock(&ctx->mutex);
+
+   put_ctx(ctx);
+   if (clone_ctx)
+   put_ctx(clone_ctx);
+}
+
 struct perf_read_data {
struct perf_event *event;
bool group;
@@ -7559,18 +7610,18 @@ void perf_event_exec(void)
struct perf_event_context *ctx;
int ctxn;
 
-   rcu_read_lock();
for_each_task_context_nr(ctxn) {
-   ctx = current->perf_event_ctxp[ctxn];
-   if (!ctx)
-   continue;
-
perf_event_enable_on_exec(ctxn);
+   perf_event_remove_on_exec(ctxn);
 
-   perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL,
-  true);
+   rcu_read_lock();
+   ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+   if (ctx) {
+   perf_iterate_ctx(ctx, perf_event_addr_filters_exec,
+NULL, true);
+   }
+   rcu_read_unlock();
}
-   rcu_read_unlock();
 }
 
 struct remote_output {
@@ -11652,6 +11703,9 @@ static int perf_copy_attr(struct perf_event_attr __user 
*uattr,
if (!attr->inherit && attr->inherit_thread)
return -EINVAL;
 
+   if (attr->remove_on_exec && attr->enable_on_exec)
+   return -EINVAL;
+
 out:
return ret;
 
-- 
2.31.0.208.g409f899ff0-goog



[PATCH v4 05/10] signal: Introduce TRAP_PERF si_code and si_perf to siginfo

2021-04-08 Thread Marco Elver
Introduces the TRAP_PERF si_code, and associated siginfo_t field
si_perf. These will be used by the perf event subsystem to send signals
(if requested) to the task where an event occurred.

Acked-by: Geert Uytterhoeven  # m68k
Acked-by: Arnd Bergmann  # asm-generic
Signed-off-by: Marco Elver 
---
 arch/m68k/kernel/signal.c  |  3 +++
 arch/x86/kernel/signal_compat.c|  5 -
 fs/signalfd.c  |  4 
 include/linux/compat.h |  2 ++
 include/linux/signal.h |  1 +
 include/uapi/asm-generic/siginfo.h |  6 +-
 include/uapi/linux/signalfd.h  |  4 +++-
 kernel/signal.c| 11 +++
 8 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c
index 349570f16a78..a4b7ee1df211 100644
--- a/arch/m68k/kernel/signal.c
+++ b/arch/m68k/kernel/signal.c
@@ -622,6 +622,9 @@ static inline void siginfo_build_tests(void)
/* _sigfault._addr_pkey */
BUILD_BUG_ON(offsetof(siginfo_t, si_pkey) != 0x12);
 
+   /* _sigfault._perf */
+   BUILD_BUG_ON(offsetof(siginfo_t, si_perf) != 0x10);
+
/* _sigpoll */
BUILD_BUG_ON(offsetof(siginfo_t, si_band)   != 0x0c);
BUILD_BUG_ON(offsetof(siginfo_t, si_fd) != 0x10);
diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
index a5330ff498f0..0e5d0a7e203b 100644
--- a/arch/x86/kernel/signal_compat.c
+++ b/arch/x86/kernel/signal_compat.c
@@ -29,7 +29,7 @@ static inline void signal_compat_build_tests(void)
BUILD_BUG_ON(NSIGFPE  != 15);
BUILD_BUG_ON(NSIGSEGV != 9);
BUILD_BUG_ON(NSIGBUS  != 5);
-   BUILD_BUG_ON(NSIGTRAP != 5);
+   BUILD_BUG_ON(NSIGTRAP != 6);
BUILD_BUG_ON(NSIGCHLD != 6);
BUILD_BUG_ON(NSIGSYS  != 2);
 
@@ -138,6 +138,9 @@ static inline void signal_compat_build_tests(void)
BUILD_BUG_ON(offsetof(siginfo_t, si_pkey) != 0x20);
BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pkey) != 0x14);
 
+   BUILD_BUG_ON(offsetof(siginfo_t, si_perf) != 0x18);
+   BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf) != 0x10);
+
CHECK_CSI_OFFSET(_sigpoll);
CHECK_CSI_SIZE  (_sigpoll, 2*sizeof(int));
CHECK_SI_SIZE   (_sigpoll, 4*sizeof(int));
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 456046e15873..040a1142915f 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -134,6 +134,10 @@ static int signalfd_copyinfo(struct signalfd_siginfo 
__user *uinfo,
 #endif
new.ssi_addr_lsb = (short) kinfo->si_addr_lsb;
break;
+   case SIL_PERF_EVENT:
+   new.ssi_addr = (long) kinfo->si_addr;
+   new.ssi_perf = kinfo->si_perf;
+   break;
case SIL_CHLD:
new.ssi_pid= kinfo->si_pid;
new.ssi_uid= kinfo->si_uid;
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 6e65be753603..c8821d966812 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -236,6 +236,8 @@ typedef struct compat_siginfo {
char 
_dummy_pkey[__COMPAT_ADDR_BND_PKEY_PAD];
u32 _pkey;
} _addr_pkey;
+   /* used when si_code=TRAP_PERF */
+   compat_u64 _perf;
};
} _sigfault;
 
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 205526c4003a..1e98548d7cf6 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -43,6 +43,7 @@ enum siginfo_layout {
SIL_FAULT_MCEERR,
SIL_FAULT_BNDERR,
SIL_FAULT_PKUERR,
+   SIL_PERF_EVENT,
SIL_CHLD,
SIL_RT,
SIL_SYS,
diff --git a/include/uapi/asm-generic/siginfo.h 
b/include/uapi/asm-generic/siginfo.h
index d2597000407a..d0bb9125c853 100644
--- a/include/uapi/asm-generic/siginfo.h
+++ b/include/uapi/asm-generic/siginfo.h
@@ -91,6 +91,8 @@ union __sifields {
char _dummy_pkey[__ADDR_BND_PKEY_PAD];
__u32 _pkey;
} _addr_pkey;
+   /* used when si_code=TRAP_PERF */
+   __u64 _perf;
};
} _sigfault;
 
@@ -155,6 +157,7 @@ typedef struct siginfo {
 #define si_lower   _sifields._sigfault._addr_bnd._lower
 #define si_upper   _sifields._sigfault._addr_bnd._upper
 #define si_pkey_sifields._sigfault._addr_pkey._pkey
+#define si_perf_sifields._sigfault._perf
 #define si_band_sifields._sigpoll._band
 #define si_fd  _sifields._sigpoll._fd
 #define si_call_addr   _sifields._sigsys._call_addr
@@ -253,7 +256,8 @@ typedef struct siginfo {
 #define TRAP_BRANCH 3  /* process taken branch trap */
 #define TRAP_HWBKPT 4  /* hardware breakpoint/watchpoint */
 #define TRAP_UNK   5  

[PATCH v4 03/10] perf: Support only inheriting events if cloned with CLONE_THREAD

2021-04-08 Thread Marco Elver
Adds bit perf_event_attr::inherit_thread, to restricting inheriting
events only if the child was cloned with CLONE_THREAD.

This option supports the case where an event is supposed to be
process-wide only (including subthreads), but should not propagate
beyond the current process's shared environment.

Link: 
https://lore.kernel.org/lkml/ybvj6ejr%2fdy2t...@hirez.programming.kicks-ass.net/
Suggested-by: Peter Zijlstra 
Signed-off-by: Marco Elver 
---
v2:
* Add patch to series.
---
 include/linux/perf_event.h  |  5 +++--
 include/uapi/linux/perf_event.h |  3 ++-
 kernel/events/core.c| 21 ++---
 kernel/fork.c   |  2 +-
 4 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 3d478abf411c..1660039199b2 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -958,7 +958,7 @@ extern void __perf_event_task_sched_in(struct task_struct 
*prev,
   struct task_struct *task);
 extern void __perf_event_task_sched_out(struct task_struct *prev,
struct task_struct *next);
-extern int perf_event_init_task(struct task_struct *child);
+extern int perf_event_init_task(struct task_struct *child, u64 clone_flags);
 extern void perf_event_exit_task(struct task_struct *child);
 extern void perf_event_free_task(struct task_struct *task);
 extern void perf_event_delayed_put(struct task_struct *task);
@@ -1449,7 +1449,8 @@ perf_event_task_sched_in(struct task_struct *prev,
 static inline void
 perf_event_task_sched_out(struct task_struct *prev,
  struct task_struct *next) { }
-static inline int perf_event_init_task(struct task_struct *child)  { 
return 0; }
+static inline int perf_event_init_task(struct task_struct *child,
+  u64 clone_flags) { 
return 0; }
 static inline void perf_event_exit_task(struct task_struct *child) { }
 static inline void perf_event_free_task(struct task_struct *task)  { }
 static inline void perf_event_delayed_put(struct task_struct *task){ }
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index ad15e40d7f5d..813efb65fea8 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -389,7 +389,8 @@ struct perf_event_attr {
cgroup :  1, /* include cgroup events */
text_poke  :  1, /* include text poke 
events */
build_id   :  1, /* use build id in mmap2 
events */
-   __reserved_1   : 29;
+   inherit_thread :  1, /* children only inherit 
if cloned with CLONE_THREAD */
+   __reserved_1   : 28;
 
union {
__u32   wakeup_events;/* wakeup every n events */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index a9a0a46909af..de2917b3c59e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11649,6 +11649,9 @@ static int perf_copy_attr(struct perf_event_attr __user 
*uattr,
(attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT))
return -EINVAL;
 
+   if (!attr->inherit && attr->inherit_thread)
+   return -EINVAL;
+
 out:
return ret;
 
@@ -12869,12 +12872,13 @@ static int
 inherit_task_group(struct perf_event *event, struct task_struct *parent,
   struct perf_event_context *parent_ctx,
   struct task_struct *child, int ctxn,
-  int *inherited_all)
+  u64 clone_flags, int *inherited_all)
 {
int ret;
struct perf_event_context *child_ctx;
 
-   if (!event->attr.inherit) {
+   if (!event->attr.inherit ||
+   (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD))) {
*inherited_all = 0;
return 0;
}
@@ -12906,7 +12910,8 @@ inherit_task_group(struct perf_event *event, struct 
task_struct *parent,
 /*
  * Initialize the perf_event context in task_struct
  */
-static int perf_event_init_context(struct task_struct *child, int ctxn)
+static int perf_event_init_context(struct task_struct *child, int ctxn,
+  u64 clone_flags)
 {
struct perf_event_context *child_ctx, *parent_ctx;
struct perf_event_context *cloned_ctx;
@@ -12946,7 +12951,8 @@ static int perf_event_init_context(struct task_struct 
*child, int ctxn)
 */
perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
ret = inherit_task_group(event, parent, parent_ctx,
-child, ctxn, &inherited_all);
+child, ctxn, clone_flags,
+ 

[PATCH v4 02/10] perf: Apply PERF_EVENT_IOC_MODIFY_ATTRIBUTES to children

2021-04-08 Thread Marco Elver
As with other ioctls (such as PERF_EVENT_IOC_{ENABLE,DISABLE}), fix up
handling of PERF_EVENT_IOC_MODIFY_ATTRIBUTES to also apply to children.

Suggested-by: Dmitry Vyukov 
Reviewed-by: Dmitry Vyukov 
Signed-off-by: Marco Elver 
---
 kernel/events/core.c | 22 +-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index e77294c7e654..a9a0a46909af 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3199,16 +3199,36 @@ static int perf_event_modify_breakpoint(struct 
perf_event *bp,
 static int perf_event_modify_attr(struct perf_event *event,
  struct perf_event_attr *attr)
 {
+   int (*func)(struct perf_event *, struct perf_event_attr *);
+   struct perf_event *child;
+   int err;
+
if (event->attr.type != attr->type)
return -EINVAL;
 
switch (event->attr.type) {
case PERF_TYPE_BREAKPOINT:
-   return perf_event_modify_breakpoint(event, attr);
+   func = perf_event_modify_breakpoint;
+   break;
default:
/* Place holder for future additions. */
return -EOPNOTSUPP;
}
+
+   WARN_ON_ONCE(event->ctx->parent_ctx);
+
+   mutex_lock(&event->child_mutex);
+   err = func(event, attr);
+   if (err)
+   goto out;
+   list_for_each_entry(child, &event->child_list, child_list) {
+   err = func(child, attr);
+   if (err)
+   goto out;
+   }
+out:
+   mutex_unlock(&event->child_mutex);
+   return err;
 }
 
 static void ctx_sched_out(struct perf_event_context *ctx,
-- 
2.31.0.208.g409f899ff0-goog



[PATCH v4 01/10] perf: Rework perf_event_exit_event()

2021-04-08 Thread Marco Elver
From: Peter Zijlstra 

Make perf_event_exit_event() more robust, such that we can use it from
other contexts. Specifically the up and coming remove_on_exec.

For this to work we need to address a few issues. Remove_on_exec will
not destroy the entire context, so we cannot rely on TASK_TOMBSTONE to
disable event_function_call() and we thus have to use
perf_remove_from_context().

When using perf_remove_from_context(), there's two races to consider.
The first is against close(), where we can have concurrent tear-down
of the event. The second is against child_list iteration, which should
not find a half baked event.

To address this, teach perf_remove_from_context() to special case
!ctx->is_active and about DETACH_CHILD.

Signed-off-by: Peter Zijlstra (Intel) 
[ el...@google.com: fix racing parent/child exit in sync_child_event(). ]
Signed-off-by: Marco Elver 
---
v4:
* Fix for parent and child racing to exit in sync_child_event().

v3:
* New dependency for series:
  https://lkml.kernel.org/r/YFn/i3akf+toj...@hirez.programming.kicks-ass.net
---
 include/linux/perf_event.h |   1 +
 kernel/events/core.c   | 142 +
 2 files changed, 80 insertions(+), 63 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 3f7f89ea5e51..3d478abf411c 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -607,6 +607,7 @@ struct swevent_hlist {
 #define PERF_ATTACH_TASK_DATA  0x08
 #define PERF_ATTACH_ITRACE 0x10
 #define PERF_ATTACH_SCHED_CB   0x20
+#define PERF_ATTACH_CHILD  0x40
 
 struct perf_cgroup;
 struct perf_buffer;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 03db40f6cba9..e77294c7e654 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2204,6 +2204,26 @@ static void perf_group_detach(struct perf_event *event)
perf_event__header_size(leader);
 }
 
+static void sync_child_event(struct perf_event *child_event);
+
+static void perf_child_detach(struct perf_event *event)
+{
+   struct perf_event *parent_event = event->parent;
+
+   if (!(event->attach_state & PERF_ATTACH_CHILD))
+   return;
+
+   event->attach_state &= ~PERF_ATTACH_CHILD;
+
+   if (WARN_ON_ONCE(!parent_event))
+   return;
+
+   lockdep_assert_held(&parent_event->child_mutex);
+
+   sync_child_event(event);
+   list_del_init(&event->child_list);
+}
+
 static bool is_orphaned_event(struct perf_event *event)
 {
return event->state == PERF_EVENT_STATE_DEAD;
@@ -2311,6 +2331,7 @@ group_sched_out(struct perf_event *group_event,
 }
 
 #define DETACH_GROUP   0x01UL
+#define DETACH_CHILD   0x02UL
 
 /*
  * Cross CPU call to remove a performance event
@@ -2334,6 +2355,8 @@ __perf_remove_from_context(struct perf_event *event,
event_sched_out(event, cpuctx, ctx);
if (flags & DETACH_GROUP)
perf_group_detach(event);
+   if (flags & DETACH_CHILD)
+   perf_child_detach(event);
list_del_event(event, ctx);
 
if (!ctx->nr_events && ctx->is_active) {
@@ -2362,25 +2385,21 @@ static void perf_remove_from_context(struct perf_event 
*event, unsigned long fla
 
lockdep_assert_held(&ctx->mutex);
 
-   event_function_call(event, __perf_remove_from_context, (void *)flags);
-
/*
-* The above event_function_call() can NO-OP when it hits
-* TASK_TOMBSTONE. In that case we must already have been detached
-* from the context (by perf_event_exit_event()) but the grouping
-* might still be in-tact.
+* Because of perf_event_exit_task(), perf_remove_from_context() ought
+* to work in the face of TASK_TOMBSTONE, unlike every other
+* event_function_call() user.
 */
-   WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
-   if ((flags & DETACH_GROUP) &&
-   (event->attach_state & PERF_ATTACH_GROUP)) {
-   /*
-* Since in that case we cannot possibly be scheduled, simply
-* detach now.
-*/
-   raw_spin_lock_irq(&ctx->lock);
-   perf_group_detach(event);
+   raw_spin_lock_irq(&ctx->lock);
+   if (!ctx->is_active) {
+   __perf_remove_from_context(event, __get_cpu_context(ctx),
+  ctx, (void *)flags);
raw_spin_unlock_irq(&ctx->lock);
+   return;
}
+   raw_spin_unlock_irq(&ctx->lock);
+
+   event_function_call(event, __perf_remove_from_context, (void *)flags);
 }
 
 /*
@@ -12373,14 +12392,17 @@ void perf_pmu_migrate_context(struct pmu *pmu, int 
src_cpu, int dst_cpu)
 }
 EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
 
-static void sync_child_event(struct perf_event *child_event,
-  struct task_struct *child)
+static void sync_child_event(struct perf_event *child

[PATCH v4 00/10] Add support for synchronous signals on perf events

2021-04-08 Thread Marco Elver
"Add support for SIGTRAP on perf events" to trigger
the signal was suggested by Peter Zijlstra in [3].

[2] 
https://lore.kernel.org/lkml/CACT4Y+YPrXGw+AtESxAgPyZ84TYkNZdP0xpocX2jwVAbZD=-x...@mail.gmail.com/

[3] 
https://lore.kernel.org/lkml/ybv3rat566k+6...@hirez.programming.kicks-ass.net/

Marco Elver (9):
  perf: Apply PERF_EVENT_IOC_MODIFY_ATTRIBUTES to children
  perf: Support only inheriting events if cloned with CLONE_THREAD
  perf: Add support for event removal on exec
  signal: Introduce TRAP_PERF si_code and si_perf to siginfo
  perf: Add support for SIGTRAP on perf events
  selftests/perf_events: Add kselftest for process-wide sigtrap handling
  selftests/perf_events: Add kselftest for remove_on_exec
  tools headers uapi: Sync tools/include/uapi/linux/perf_event.h
  perf test: Add basic stress test for sigtrap handling

Peter Zijlstra (1):
  perf: Rework perf_event_exit_event()

 arch/m68k/kernel/signal.c |   3 +
 arch/x86/kernel/signal_compat.c   |   5 +-
 fs/signalfd.c |   4 +
 include/linux/compat.h|   2 +
 include/linux/perf_event.h|   9 +-
 include/linux/signal.h|   1 +
 include/uapi/asm-generic/siginfo.h|   6 +-
 include/uapi/linux/perf_event.h   |  12 +-
 include/uapi/linux/signalfd.h |   4 +-
 kernel/events/core.c  | 302 +-
 kernel/fork.c |   2 +-
 kernel/signal.c   |  11 +
 tools/include/uapi/linux/perf_event.h |  12 +-
 tools/perf/tests/Build|   1 +
 tools/perf/tests/builtin-test.c   |   5 +
 tools/perf/tests/sigtrap.c| 150 +
 tools/perf/tests/tests.h  |   1 +
 .../testing/selftests/perf_events/.gitignore  |   3 +
 tools/testing/selftests/perf_events/Makefile  |   6 +
 tools/testing/selftests/perf_events/config|   1 +
 .../selftests/perf_events/remove_on_exec.c| 260 +++
 tools/testing/selftests/perf_events/settings  |   1 +
 .../selftests/perf_events/sigtrap_threads.c   | 210 
 23 files changed, 924 insertions(+), 87 deletions(-)
 create mode 100644 tools/perf/tests/sigtrap.c
 create mode 100644 tools/testing/selftests/perf_events/.gitignore
 create mode 100644 tools/testing/selftests/perf_events/Makefile
 create mode 100644 tools/testing/selftests/perf_events/config
 create mode 100644 tools/testing/selftests/perf_events/remove_on_exec.c
 create mode 100644 tools/testing/selftests/perf_events/settings
 create mode 100644 tools/testing/selftests/perf_events/sigtrap_threads.c

-- 
2.31.0.208.g409f899ff0-goog



Re: [PATCH v3 1/2] kunit: add a KUnit test for SLUB debugging functionality

2021-04-08 Thread Marco Elver
On Tue, 6 Apr 2021 at 12:57, Vlastimil Babka  wrote:
>
>
> On 4/1/21 11:24 PM, Marco Elver wrote:
> > On Thu, 1 Apr 2021 at 21:04, Daniel Latypov  wrote:
> >> > }
> >> > #else
> >> > static inline bool slab_add_kunit_errors(void) { return false; }
> >> > #endif
> >> >
> >> > And anywhere you want to increase the error count, you'd call
> >> > slab_add_kunit_errors().
> >> >
> >> > Another benefit of this approach is that if KUnit is disabled, there is
> >> > zero overhead and no additional code generated (vs. the current
> >> > approach).
> >>
> >> The resource approach looks really good, but...
> >> You'd be picking up a dependency on
> >> https://lore.kernel.org/linux-kselftest/20210311152314.3814916-2-dlaty...@google.com/
> >> current->kunit_test will always be NULL unless CONFIG_KASAN=y &&
> >> CONFIG_KUNIT=y at the moment.
> >> My patch drops the CONFIG_KASAN requirement and opens it up to all tests.
> >
> > Oh, that's a shame, but hopefully it'll be in -next soon.
> >
> >> At the moment, it's just waiting another look over from Brendan or David.
> >> Any ETA on that, folks? :)
> >>
> >> So if you don't want to get blocked on that for now, I think it's fine to 
> >> add:
> >>   #ifdef CONFIG_SLUB_KUNIT_TEST
> >>   int errors;
> >>   #endif
> >
> > Until kunit fixes setting current->kunit_test, a cleaner workaround
> > that would allow to do the patch with kunit_resource, is to just have
> > an .init/.exit function that sets it ("current->kunit_test = test;").
> > And then perhaps add a note ("FIXME: ...") to remove it once the above
> > patch has landed.
> >
> > At least that way we get the least intrusive change for mm/slub.c, and
> > the test is the only thing that needs a 2-line patch to clean up
> > later.
>
> So when testing internally Oliver's new version with your suggestions (thanks
> again for those), I got lockdep splats because slab_add_kunit_errors is called
> also from irq disabled contexts, and kunit_find_named_resource will call
> spin_lock(&test->lock) that's not irq safe. Can we make the lock irq safe? I
> tried the change below and it makde the problem go away. If you agree, the
> question is how to proceed - make it part of Oliver's patch series and let
> Andrew pick it all with eventually kunit team's acks on this patch, or 
> whatnot.

>From what I can tell it should be fine to make it irq safe (ack for
your patch below). Regarding patch logistics, I'd probably add it to
the series. If that ends up not working, we'll find out sooner or
later.

(FYI, the prerequisite patch for current->kunit_test is in -next now.)

KUnit maintainers, do you have any preferences?

> 8<
>
> commit ab28505477892e9824c57ac338c88aec2ec0abce
> Author: Vlastimil Babka 
> Date:   Tue Apr 6 12:28:07 2021 +0200
>
> kunit: make test->lock irq safe
>
> diff --git a/include/kunit/test.h b/include/kunit/test.h
> index 49601c4b98b8..524d4789af22 100644
> --- a/include/kunit/test.h
> +++ b/include/kunit/test.h
> @@ -515,8 +515,9 @@ kunit_find_resource(struct kunit *test,
> void *match_data)
>  {
> struct kunit_resource *res, *found = NULL;
> +   unsigned long flags;
>
> -   spin_lock(&test->lock);
> +   spin_lock_irqsave(&test->lock, flags);
>
> list_for_each_entry_reverse(res, &test->resources, node) {
> if (match(test, res, (void *)match_data)) {
> @@ -526,7 +527,7 @@ kunit_find_resource(struct kunit *test,
> }
> }
>
> -   spin_unlock(&test->lock);
> +   spin_unlock_irqrestore(&test->lock, flags);
>
> return found;
>  }
> diff --git a/lib/kunit/test.c b/lib/kunit/test.c
> index ec9494e914ef..2c62eeb45b82 100644
> --- a/lib/kunit/test.c
> +++ b/lib/kunit/test.c
> @@ -442,6 +442,7 @@ int kunit_add_resource(struct kunit *test,
>void *data)
>  {
> int ret = 0;
> +   unsigned long flags;
>
> res->free = free;
> kref_init(&res->refcount);
> @@ -454,10 +455,10 @@ int kunit_add_resource(struct kunit *test,
> res->data = data;
> }
>
> -   spin_lock(&test->lock);
> +   spin_lock_irqsave(&test->lock, flags);
> list_add_tail(&res->node, &test->resources);
> /* refcount for list is established by kref_init() */
> -   spin_unlock(&test->lock);
> +   spin_unlock_irqrestore(&test->lock, flags);
>
&g

Re: [PATCH v2] kasan: fix kasan_byte_accessible() to be consistent with actual checks

2021-04-05 Thread Marco Elver
On Tue, 6 Apr 2021 at 00:43, Andrey Konovalov  wrote:
> On Tue, Apr 6, 2021 at 12:07 AM Peter Collingbourne  wrote:
> >
> > We can sometimes end up with kasan_byte_accessible() being called
> > on non-slab memory. For example ksize() and krealloc() may end up
> > calling it on KFENCE allocated memory. In this case the memory will
> > be tagged with KASAN_SHADOW_INIT, which a subsequent patch ("kasan:
> > initialize shadow to TAG_INVALID for SW_TAGS") will set to the same
> > value as KASAN_TAG_INVALID, causing kasan_byte_accessible() to fail
> > when called on non-slab memory.
> >
> > This highlighted the fact that the check in kasan_byte_accessible()
> > was inconsistent with checks as implemented for loads and stores
> > (kasan_check_range() in SW tags mode and hardware-implemented
> > checks in HW tags mode). kasan_check_range() does not have a
> > check for KASAN_TAG_INVALID, and instead has a comparison against
> > KASAN_SHADOW_START. In HW tags mode, we do not have either, but we
> > do set TCR_EL1.TCMA which corresponds with the comparison against
> > KASAN_TAG_KERNEL.
> >
> > Therefore, update kasan_byte_accessible() for both SW and HW tags
> > modes to correspond with the respective checks on loads and stores.
> >
> > Link: 
> > https://linux-review.googlesource.com/id/Ic6d40803c57dcc6331bd97fbb9a60b0d38a65a36
> > Signed-off-by: Peter Collingbourne 
> > ---
> >  mm/kasan/kasan.h   |  3 +--
> >  mm/kasan/sw_tags.c | 10 +++---
> >  2 files changed, 8 insertions(+), 5 deletions(-)
> >
> > diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
> > index 8c55634d6edd..e18e8da35255 100644
> > --- a/mm/kasan/kasan.h
> > +++ b/mm/kasan/kasan.h
> > @@ -368,8 +368,7 @@ static inline bool kasan_byte_accessible(const void 
> > *addr)
> > u8 ptr_tag = get_tag(addr);
> > u8 mem_tag = hw_get_mem_tag((void *)addr);
> >
> > -   return (mem_tag != KASAN_TAG_INVALID) &&
> > -   (ptr_tag == KASAN_TAG_KERNEL || ptr_tag == mem_tag);
> > +   return ptr_tag == KASAN_TAG_KERNEL || ptr_tag == mem_tag;
> >  }
> >
> >  #else /* CONFIG_KASAN_HW_TAGS */
> > diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c
> > index 94c2d33be333..00ae8913fc74 100644
> > --- a/mm/kasan/sw_tags.c
> > +++ b/mm/kasan/sw_tags.c
> > @@ -121,10 +121,14 @@ bool kasan_check_range(unsigned long addr, size_t 
> > size, bool write,
> >  bool kasan_byte_accessible(const void *addr)
> >  {
> > u8 tag = get_tag(addr);
> > -   u8 shadow_byte = READ_ONCE(*(u8 
> > *)kasan_mem_to_shadow(kasan_reset_tag(addr)));
> > +   void *untagged_addr = kasan_reset_tag(addr);
> > +   u8 shadow_byte;
> >
> > -   return (shadow_byte != KASAN_TAG_INVALID) &&
> > -   (tag == KASAN_TAG_KERNEL || tag == shadow_byte);
> > +   if (untagged_addr < kasan_shadow_to_mem((void *)KASAN_SHADOW_START))
> > +   return false;
> > +
> > +   shadow_byte = READ_ONCE(*(u8 *)kasan_mem_to_shadow(untagged_addr));
> > +   return tag == KASAN_TAG_KERNEL || tag == shadow_byte;
> >  }
> >
> >  #define DEFINE_HWASAN_LOAD_STORE(size) \
> > --
> > 2.31.0.208.g409f899ff0-goog
> >
>
> Reviewed-by: Andrey Konovalov 
>
> Thank you, Peter!

Reviewed-by: Marco Elver 

Thanks!


Re: [PATCH] kfence: unpoison pool region before use

2021-04-03 Thread Marco Elver
On Sat, 3 Apr 2021 at 22:40, Peter Collingbourne  wrote:
> On Sat, Apr 3, 2021 at 3:03 AM Marco Elver  wrote:
> > On Sat, 3 Apr 2021 at 07:13, Peter Collingbourne  wrote:
> > > If the memory region allocated by KFENCE had previously been poisoned,
> > > any validity checks done using kasan_byte_accessible() will fail. Fix
> > > it by unpoisoning the memory before using it as the pool region.
> > >
> > > Link: 
> > > https://linux-review.googlesource.com/id/I0af99e9f1c25eaf7e1ec295836b5d148d76940c5
> > > Signed-off-by: Peter Collingbourne 
> >
> > Thanks, at a high level this seems reasonable, because we always want
> > to ensure that KFENCE memory remains unpoisoned with KASAN on. FWIW I
> > subjected a config with KFENCE+KASAN (generic, SW_TAGS, and HW_TAGS)
> > to syzkaller testing and ran kfence_test:
> >
> >   Tested-by: Marco Elver 
> >
> >
> > However, it is unclear to me under which circumstances we actually
> > need this, i.e. something would grab some memblock memory, somehow
> > poison it, and then release the memory back during early boot (note,
> > kfence_alloc_pool() is called before slab setup). If we can somehow
> > understand what actually did this, perhaps it'd help tell us if this
> > actually needs fixing in KFENCE or it's the other thing that needs a
> > fix.
> >
> > Given all this is happening during really early boot, I'd expect no or
> > very few calls to kasan_poison() until kfence_alloc_pool() is called.
> > We can probably debug it more by having kasan_poison() do a "if
> > (!__kfence_pool) dump_stack();" somewhere. Can you try this on the
> > system where you can repro the problem? I tried this just now on the
> > latest mainline kernel, and saw 0 calls until kfence_alloc_pool().
>
> I looked into the issue some more, and it turned out that the memory
> wasn't getting poisoned by kasan_poison() but rather by the calls to
> kasan_map_populate() in kasan_init_shadow(). Starting with the patch
> "kasan: initialize shadow to TAG_INVALID for SW_TAGS",
> KASAN_SHADOW_INIT is set to 0xFE rather than 0xFF, which caused the
> failure. The Android kernel branch for 5.10 (and the downstream kernel
> I was working with) already have this patch, but it isn't in the
> mainline kernel yet.
>
> Now that I understand the cause of the issue, I can reproduce it using
> the KFENCE unit tests on a db845c board, using both the Android 5.10
> and mainline branches if I cherry-pick that change. Here's an example
> crash from the unit tests (the failure was originally also observed
> from ksize in the downstream kernel):
>
> [   46.692195][  T175] BUG: KASAN: invalid-access in test_krealloc+0x1c4/0xf98
> [   46.699282][  T175] Read of size 1 at addr ff80e9e7b000 by task
> kunit_try_catch/175
> [   46.707400][  T175] Pointer tag: [ff], memory tag: [fe]
> [   46.712710][  T175]
> [   46.714955][  T175] CPU: 4 PID: 175 Comm: kunit_try_catch Tainted:
> GB 5.12.0-rc5-mainline-09505-ga2ab5b26d445-dirty #1
> [   46.727193][  T175] Hardware name: Thundercomm Dragonboard 845c (DT)
> [   46.733636][  T175] Call trace:
> [   46.736841][  T175]  dump_backtrace+0x0/0x2f8
> [   46.741295][  T175]  show_stack+0x2c/0x3c
> [   46.745388][  T175]  dump_stack+0x124/0x1bc
> [   46.749668][  T175]  print_address_description+0x7c/0x308
> [   46.755178][  T175]  __kasan_report+0x1a8/0x398
> [   46.759816][  T175]  kasan_report+0x50/0x7c
> [   46.764103][  T175]  __kasan_check_byte+0x3c/0x54
> [   46.768916][  T175]  ksize+0x4c/0x94
> [   46.772573][  T175]  test_krealloc+0x1c4/0xf98
> [   46.777108][  T175]  kunit_try_run_case+0x94/0x1c4
> [   46.781990][  T175]  kunit_generic_run_threadfn_adapter+0x30/0x44
> [   46.788196][  T175]  kthread+0x20c/0x234
> [   46.792213][  T175]  ret_from_fork+0x10/0x30
>
> Since "kasan: initialize shadow to TAG_INVALID for SW_TAGS" hasn't
> landed in mainline yet, it seems like we should insert this patch
> before that one rather than adding a Fixes: tag.

Thanks for getting to the bottom of it.

However, given the above, I think we need to explain this in the
commit message (which also makes the dependency between these 2
patches clear) and add a comment above the new kasan_unpoison_range().
That is, if we still think this is the right fix -- I'm not entirely
sure it is.

Because what I gather from "kasan: initialize shadow to TAG_INVALID
for SW_TAGS", is the requirement that "0xFF pointer tag is a match-all
tag, it doesn't matter what tag the accessed memory has".

While KFENCE memory is accessible through the slab API, and in this
case ksize() calling kasan_check_byte() leading

Re: [PATCH] kfence: unpoison pool region before use

2021-04-03 Thread Marco Elver
On Sat, 3 Apr 2021 at 16:05, Andrey Konovalov  wrote:
...
> Which kasan_byte_accessible() call fails?
>
> KASAN checks shouldn't be performed for KFENCE objects. We have a
> number of is_kfence_address() checks in KASAN runtime, but maybe we're
> missing some. Perhaps, we should even move those checks into the
> high-level wrappers in include/linux/kasan.h.

Moving them into include/linux/kasan.h seems unnecessary and an easy
way to introduce unnecessary overhead. AFAIK, there should be no
difference between having them in the high-level wrappers and the
inner runtime functions. I think until we understand what is actually
going on and could thoroughly justify, I'd be opposed to larger
changes. The small patch here is innocent enough, but it'd still be
good to understand. (FWIW, I believe the issue was encountered with
SW_TAGS on a downstream kernel.)


Re: [PATCH] kfence: unpoison pool region before use

2021-04-03 Thread Marco Elver
On Sat, 3 Apr 2021 at 07:13, Peter Collingbourne  wrote:
> If the memory region allocated by KFENCE had previously been poisoned,
> any validity checks done using kasan_byte_accessible() will fail. Fix
> it by unpoisoning the memory before using it as the pool region.
>
> Link: 
> https://linux-review.googlesource.com/id/I0af99e9f1c25eaf7e1ec295836b5d148d76940c5
> Signed-off-by: Peter Collingbourne 

Thanks, at a high level this seems reasonable, because we always want
to ensure that KFENCE memory remains unpoisoned with KASAN on. FWIW I
subjected a config with KFENCE+KASAN (generic, SW_TAGS, and HW_TAGS)
to syzkaller testing and ran kfence_test:

  Tested-by: Marco Elver 


However, it is unclear to me under which circumstances we actually
need this, i.e. something would grab some memblock memory, somehow
poison it, and then release the memory back during early boot (note,
kfence_alloc_pool() is called before slab setup). If we can somehow
understand what actually did this, perhaps it'd help tell us if this
actually needs fixing in KFENCE or it's the other thing that needs a
fix.

Given all this is happening during really early boot, I'd expect no or
very few calls to kasan_poison() until kfence_alloc_pool() is called.
We can probably debug it more by having kasan_poison() do a "if
(!__kfence_pool) dump_stack();" somewhere. Can you try this on the
system where you can repro the problem? I tried this just now on the
latest mainline kernel, and saw 0 calls until kfence_alloc_pool().

Thanks,
-- Marco

> ---
>  mm/kfence/core.c | 12 +---
>  1 file changed, 9 insertions(+), 3 deletions(-)
>
> diff --git a/mm/kfence/core.c b/mm/kfence/core.c
> index d53c91f881a4..bb22b0cf77aa 100644
> --- a/mm/kfence/core.c
> +++ b/mm/kfence/core.c
> @@ -633,13 +633,19 @@ static DECLARE_DELAYED_WORK(kfence_timer, 
> toggle_allocation_gate);
>
>  void __init kfence_alloc_pool(void)
>  {
> +   void *pool;
> +
> if (!kfence_sample_interval)
> return;
>
> -   __kfence_pool = memblock_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);
> -
> -   if (!__kfence_pool)
> +   pool = memblock_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);
> +   if (!pool) {
> pr_err("failed to allocate pool\n");
> +   return;
> +   }
> +
> +   kasan_unpoison_range(pool, KFENCE_POOL_SIZE);
> +   __kfence_pool = pool;
>  }
>
>  void __init kfence_init(void)
> --
> 2.31.0.208.g409f899ff0-goog
>


Re: [PATCH v3 1/2] kunit: add a KUnit test for SLUB debugging functionality

2021-04-01 Thread Marco Elver
On Thu, 1 Apr 2021 at 21:04, Daniel Latypov  wrote:
...
> > > --- a/include/linux/slub_def.h
> > > +++ b/include/linux/slub_def.h
> > > @@ -133,6 +133,8 @@ struct kmem_cache {
> > >   unsigned int usersize;  /* Usercopy region size */
> > >
> > >   struct kmem_cache_node *node[MAX_NUMNODES];
> > > +
> > > + int errors; /* Number of errors in cache */
> >
> > So, I think it's bad design to add a new field 'errors', just for the
> > test. This will increase kmem_cache size for all builds, which is
> > unnecessary.
> >
> > Is there use to retrieve 'errors' elsewhere?
> >
> > While you could guard this with #ifdef CONFIG_SLUB_DEBUG or so, there's
> > a better design option if this is just for the KUnit test's benefit: use
> > kunit_resource.
> >
> > The way it'd work is that for each test (you can add a common init
> > function) you add a named resource, in this case just an 'int' I guess,
> > that slab would be able to retrieve if this test is being run.
> >
> > In the test somewhere, you could add something like this:
> >
> >
> > static struct kunit_resource resource;
> > static int slab_errors;
> >
> > ..
> >
> > static int test_init(struct kunit *test)
> > {
> > slab_errors = 0;
> > kunit_add_named_resource(test, NULL, NULL, ,
> >  "slab_errors", _errors);
> > return 0;
> > }
> >
> > .. tests now check slab_errors .
> >
> > and then in slub.c you'd have:
> >
> > #if IS_ENABLED(CONFIG_KUNIT)
> > static bool slab_add_kunit_errors(void)
> > {
> > struct kunit_resource *resource;
> >
> > if (likely(!current->kunit_test))
> > return false;
> > resource = kunit_find_named_resource(current->kunit_test, 
> > "slab_errors");
> > if (!resource)
> > return false;
> > (*(int *)resource->data)++;
> > kunit_put_resource(resource);

  return true;

was missing.

> > }
> > #else
> > static inline bool slab_add_kunit_errors(void) { return false; }
> > #endif
> >
> > And anywhere you want to increase the error count, you'd call
> > slab_add_kunit_errors().
> >
> > Another benefit of this approach is that if KUnit is disabled, there is
> > zero overhead and no additional code generated (vs. the current
> > approach).
>
> The resource approach looks really good, but...
> You'd be picking up a dependency on
> https://lore.kernel.org/linux-kselftest/20210311152314.3814916-2-dlaty...@google.com/
> current->kunit_test will always be NULL unless CONFIG_KASAN=y &&
> CONFIG_KUNIT=y at the moment.
> My patch drops the CONFIG_KASAN requirement and opens it up to all tests.

Oh, that's a shame, but hopefully it'll be in -next soon.

> At the moment, it's just waiting another look over from Brendan or David.
> Any ETA on that, folks? :)
>
> So if you don't want to get blocked on that for now, I think it's fine to add:
>   #ifdef CONFIG_SLUB_KUNIT_TEST
>   int errors;
>   #endif

Until kunit fixes setting current->kunit_test, a cleaner workaround
that would allow to do the patch with kunit_resource, is to just have
an .init/.exit function that sets it ("current->kunit_test = test;").
And then perhaps add a note ("FIXME: ...") to remove it once the above
patch has landed.

At least that way we get the least intrusive change for mm/slub.c, and
the test is the only thing that needs a 2-line patch to clean up
later.

Thanks,
-- Marco


Re: [PATCH v3 1/2] kunit: add a KUnit test for SLUB debugging functionality

2021-04-01 Thread Marco Elver
[Note, if you'd like me to see future versions, please Cc me, otherwise
it's unlikely I see it in time. Also add kunit-...@googlegroups.com if
perhaps a KUnit dev should have another look, too.]

On Wed, Mar 31, 2021 at 10:51AM +0200, glit...@gmail.com wrote:
> From: Oliver Glitta 
> 
> SLUB has resiliency_test() function which is hidden behind #ifdef
> SLUB_RESILIENCY_TEST that is not part of Kconfig, so nobody
> runs it. KUnit should be a proper replacement for it.
> 
> Try changing byte in redzone after allocation and changing
> pointer to next free node, first byte, 50th byte and redzone
> byte. Check if validation finds errors.
> 
> There are several differences from the original resiliency test:
> Tests create own caches with known state instead of corrupting
> shared kmalloc caches.
> 
> The corruption of freepointer uses correct offset, the original
> resiliency test got broken with freepointer changes.
> 
> Scratch changing random byte test, because it does not have
> meaning in this form where we need deterministic results.
> 
> Add new option CONFIG_SLUB_KUNIT_TEST in Kconfig.
> 
> Add a counter field "errors" to struct kmem_cache to count number
> of errors detected in cache.
> 
> Silence bug report in SLUB test. Add SLAB_SILENT_ERRORS debug flag.
> Add SLAB_SILENT_ERRORS flag to SLAB_NEVER_MERGE, SLAB_DEBUG_FLAGS,
> SLAB_FLAGS_PERMITTED macros.
> 
> Signed-off-by: Oliver Glitta 
> ---
> Changes since v2
> 
> Use bit operation & instead of logical && as reported by kernel test 
> robot and Dan Carpenter
> 
> Changes since v1
> 
> Conversion from kselftest to KUnit test suggested by Marco Elver.
> Error silencing.
> Error counting improvements. 
> 
>  include/linux/slab.h |   2 +
>  include/linux/slub_def.h |   2 +
>  lib/Kconfig.debug|   5 ++
>  lib/Makefile |   1 +
>  lib/test_slub.c  | 124 +++
>  mm/slab.h|   7 ++-
>  mm/slab_common.c |   2 +-
>  mm/slub.c|  64 +---
>  8 files changed, 184 insertions(+), 23 deletions(-)
>  create mode 100644 lib/test_slub.c
> 
> diff --git a/include/linux/slab.h b/include/linux/slab.h
> index 7ae604076767..ed1a5a64d028 100644
> --- a/include/linux/slab.h
> +++ b/include/linux/slab.h
> @@ -25,6 +25,8 @@
>   */
>  /* DEBUG: Perform (expensive) checks on alloc/free */
>  #define SLAB_CONSISTENCY_CHECKS  ((slab_flags_t __force)0x0100U)
> +/* DEBUG: Silent bug reports */
> +#define SLAB_SILENT_ERRORS   ((slab_flags_t __force)0x0200U)

This flag wouldn't be necessary if you do the design using
kunit_resource (see below).

(But perhaps I missed a conversation that said that this flag is
generally useful, but if so, it should probably be in a separate patch
justifying why it is required beyond the test.)

>  /* DEBUG: Red zone objs in a cache */
>  #define SLAB_RED_ZONE((slab_flags_t __force)0x0400U)
>  /* DEBUG: Poison objects */
> diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
> index dcde82a4434c..e4b51bb5bb83 100644
> --- a/include/linux/slub_def.h
> +++ b/include/linux/slub_def.h
> @@ -133,6 +133,8 @@ struct kmem_cache {
>   unsigned int usersize;  /* Usercopy region size */
>  
>   struct kmem_cache_node *node[MAX_NUMNODES];
> +
> + int errors; /* Number of errors in cache */

So, I think it's bad design to add a new field 'errors', just for the
test. This will increase kmem_cache size for all builds, which is
unnecessary.

Is there use to retrieve 'errors' elsewhere?

While you could guard this with #ifdef CONFIG_SLUB_DEBUG or so, there's
a better design option if this is just for the KUnit test's benefit: use
kunit_resource.

The way it'd work is that for each test (you can add a common init
function) you add a named resource, in this case just an 'int' I guess,
that slab would be able to retrieve if this test is being run.

In the test somewhere, you could add something like this:


static struct kunit_resource resource;
static int slab_errors;

..

static int test_init(struct kunit *test)
{
slab_errors = 0;
kunit_add_named_resource(test, NULL, NULL, ,
 "slab_errors", _errors);
return 0;
}

.. tests now check slab_errors .

and then in slub.c you'd have:

#if IS_ENABLED(CONFIG_KUNIT)
static bool slab_add_kunit_errors(void)
{
struct kunit_resource *resource;

if (likely(!current->kunit_test))
return false;
resource = kunit_find

Re: [PATCH] kasan: detect false-positives in tests

2021-03-31 Thread Marco Elver
On Wed, 31 Mar 2021 at 18:25, Andrey Konovalov  wrote:
>
> Currently, KASAN-KUnit tests can check that a particular annotated part
> of code causes a KASAN report. However, they do not check that no unwanted
> reports happen between the annotated parts.
>
> This patch implements these checks.
>
> It is done by setting report_data.report_found to false in
> kasan_test_init() and at the end of KUNIT_EXPECT_KASAN_FAIL() and then
> checking that it remains false at the beginning of
> KUNIT_EXPECT_KASAN_FAIL() and in kasan_test_exit().
>
> kunit_add_named_resource() call is moved to kasan_test_init(), and the
> value of fail_data.report_expected is kept as false in between
> KUNIT_EXPECT_KASAN_FAIL() annotations for consistency.
>
> Signed-off-by: Andrey Konovalov 

Reviewed-by: Marco Elver 

Thank you!

> ---
>  lib/test_kasan.c | 49 +++-
>  1 file changed, 28 insertions(+), 21 deletions(-)
>
> diff --git a/lib/test_kasan.c b/lib/test_kasan.c
> index d77c45edc7cd..bf9225002a7e 100644
> --- a/lib/test_kasan.c
> +++ b/lib/test_kasan.c
> @@ -54,6 +54,10 @@ static int kasan_test_init(struct kunit *test)
>
> multishot = kasan_save_enable_multi_shot();
> kasan_set_tagging_report_once(false);
> +   fail_data.report_found = false;
> +   fail_data.report_expected = false;
> +   kunit_add_named_resource(test, NULL, NULL, ,
> +   "kasan_data", _data);
> return 0;
>  }
>
> @@ -61,6 +65,7 @@ static void kasan_test_exit(struct kunit *test)
>  {
> kasan_set_tagging_report_once(true);
> kasan_restore_multi_shot(multishot);
> +   KUNIT_EXPECT_FALSE(test, fail_data.report_found);
>  }
>
>  /**
> @@ -78,28 +83,30 @@ static void kasan_test_exit(struct kunit *test)
>   * fields, it can reorder or optimize away the accesses to those fields.
>   * Use READ/WRITE_ONCE() for the accesses and compiler barriers around the
>   * expression to prevent that.
> + *
> + * In between KUNIT_EXPECT_KASAN_FAIL checks, fail_data.report_found is kept 
> as
> + * false. This allows detecting KASAN reports that happen outside of the 
> checks
> + * by asserting !fail_data.report_found at the start of 
> KUNIT_EXPECT_KASAN_FAIL
> + * and in kasan_test_exit.
>   */
> -#define KUNIT_EXPECT_KASAN_FAIL(test, expression) do { \
> -   if (IS_ENABLED(CONFIG_KASAN_HW_TAGS))   \
> -   migrate_disable();  \
> -   WRITE_ONCE(fail_data.report_expected, true);\
> -   WRITE_ONCE(fail_data.report_found, false);  \
> -   kunit_add_named_resource(test,  \
> -   NULL,   \
> -   NULL,   \
> -   ,  \
> -   "kasan_data", _data);  \
> -   barrier();  \
> -   expression; \
> -   barrier();  \
> -   KUNIT_EXPECT_EQ(test,   \
> -   READ_ONCE(fail_data.report_expected),   \
> -   READ_ONCE(fail_data.report_found)); \
> -   if (IS_ENABLED(CONFIG_KASAN_HW_TAGS)) { \
> -   if (READ_ONCE(fail_data.report_found))  \
> -   kasan_enable_tagging(); \
> -   migrate_enable();   \
> -   }   \
> +#define KUNIT_EXPECT_KASAN_FAIL(test, expression) do { \
> +   if (IS_ENABLED(CONFIG_KASAN_HW_TAGS))   \
> +   migrate_disable();  \
> +   KUNIT_EXPECT_FALSE(test, READ_ONCE(fail_data.report_found));\
> +   WRITE_ONCE(fail_data.report_expected, true);\
> +   barrier();  \
> +   expression; \
> +   barrier();  \
> +   KUNIT_EXPECT_EQ(test,   \
> +   READ_ONCE(fail_data.report_expected),   \
> +   READ_ONCE(fail_data.report_found)); \
> +   if (IS_ENABLED(CONFIG_KASAN_HW_TAGS)) { \
> +   if (READ_ONCE(fail_data.report_found)) 

Re: [PATCH v3 06/11] perf: Add support for SIGTRAP on perf events

2021-03-31 Thread Marco Elver
On Wed, 31 Mar 2021 at 16:51, Peter Zijlstra  wrote:
> On Wed, Mar 31, 2021 at 02:32:58PM +0200, Marco Elver wrote:
> > On Mon, 29 Mar 2021 at 14:07, Peter Zijlstra  wrote:
> >
> > > (and we might already have a problem on some architectures where there
> > > can be significant time between these due to not having
> > > arch_irq_work_raise(), so ideally we ought to double check current in
> > > your case)
> >
> > I missed this bit -- just to verify: here we want to check that
> > event->ctx->task == current, in case the irq_work runs when the
> > current task has already been replaced. Correct?
>
> Yeah, just not sure what a decent failure would be, silent ignore seems
> undesired, maybe WARN and archs that can trigger it get to fix it ?

I'll go with a WARN and add a comment.

This also revealed there should be a requirement that sigtrap events
must be associated with a task (syzkaller managed to trigger the
warning for cpu events).

Thanks,
-- Marco


Re: [PATCH v3 06/11] perf: Add support for SIGTRAP on perf events

2021-03-31 Thread Marco Elver
On Mon, 29 Mar 2021 at 14:07, Peter Zijlstra  wrote:

> (and we might already have a problem on some architectures where there
> can be significant time between these due to not having
> arch_irq_work_raise(), so ideally we ought to double check current in
> your case)

I missed this bit -- just to verify: here we want to check that
event->ctx->task == current, in case the irq_work runs when the
current task has already been replaced. Correct?

Thanks,
-- Marco


[PATCH mm] kfence, x86: fix preemptible warning on KPTI-enabled systems

2021-03-30 Thread Marco Elver
On systems with KPTI enabled, we can currently observe the following warning:

  BUG: using smp_processor_id() in preemptible
  caller is invalidate_user_asid+0x13/0x50
  CPU: 6 PID: 1075 Comm: dmesg Not tainted 5.12.0-rc4-gda4a2b1a5479-kfence_1+ #1
  Hardware name: Hewlett-Packard HP Pro 3500 Series/2ABF, BIOS 8.11 10/24/2012
  Call Trace:
   dump_stack+0x7f/0xad
   check_preemption_disabled+0xc8/0xd0
   invalidate_user_asid+0x13/0x50
   flush_tlb_one_kernel+0x5/0x20
   kfence_protect+0x56/0x80
   ...

While it normally makes sense to require preemption to be off, so that
the expected CPU's TLB is flushed and not another, in our case it really
is best-effort (see comments in kfence_protect_page()).

Avoid the warning by disabling preemption around flush_tlb_one_kernel().

Link: https://lore.kernel.org/lkml/ygidbaboelggm...@elver.google.com/
Reported-by: Tomi Sarvela 
Signed-off-by: Marco Elver 
---
 arch/x86/include/asm/kfence.h | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/kfence.h b/arch/x86/include/asm/kfence.h
index 97bbb4a9083a..05b48b33baf0 100644
--- a/arch/x86/include/asm/kfence.h
+++ b/arch/x86/include/asm/kfence.h
@@ -56,8 +56,13 @@ static inline bool kfence_protect_page(unsigned long addr, 
bool protect)
else
set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
 
-   /* Flush this CPU's TLB. */
+   /*
+* Flush this CPU's TLB, assuming whoever did the allocation/free is
+* likely to continue running on this CPU.
+*/
+   preempt_disable();
flush_tlb_one_kernel(addr);
+   preempt_enable();
return true;
 }
 
-- 
2.31.0.291.g576ba9dcdaf-goog



Re: I915 CI-run with kfence enabled, issues found

2021-03-29 Thread Marco Elver
On Mon, 29 Mar 2021 at 23:47, Andy Lutomirski  wrote:
>
>
> > On Mar 29, 2021, at 2:34 PM, Marco Elver  wrote:
> >
> > On Mon, 29 Mar 2021 at 23:03, Dave Hansen  wrote:
> >>> On 3/29/21 10:45 AM, Marco Elver wrote:
> >>>> On Mon, 29 Mar 2021 at 19:32, Dave Hansen  wrote:
> >>> Doing it to all CPUs is too expensive, and we can tolerate this being
> >>> approximate (nothing bad will happen, KFENCE might just miss a bug and
> >>> that's ok).
> >> ...
> >>>> BTW, the preempt checks in flush_tlb_one_kernel() are dependent on KPTI
> >>>> being enabled.  That's probably why you don't see this everywhere.  We
> >>>> should probably have unconditional preempt checks in there.
> >>>
> >>> In which case I'll add a preempt_disable/enable() pair to
> >>> kfence_protect_page() in arch/x86/include/asm/kfence.h.
> >>
> >> That sounds sane to me.  I'd just plead that the special situation (not
> >> needing deterministic TLB flushes) is obvious.  We don't want any folks
> >> copying this code.
> >>
> >> BTW, I know you want to avoid the cost of IPIs, but have you considered
> >> any other low-cost ways to get quicker TLB flushes?  For instance, you
> >> could loop over all CPUs and set cpu_tlbstate.invalidate_other=1.  That
> >> would induce a context switch at the next context switch without needing
> >> an IPI.
> >
> > This is interesting. And it seems like it would work well for our
> > usecase. Ideally we should only flush entries related to the page we
> > changed. But it seems invalidate_other would flush the entire TLB.
> >
> > With PTI, flush_tlb_one_kernel() already does that for the current
> > CPU, but now we'd flush entire TLBs for all CPUs and even if PTI is
> > off.
> >
> > Do you have an intuition for how much this would affect large
> > multi-socket systems? I currently can't quite say, and would err on
> > the side of caution.
>
> Flushing the kernel TLB for all addresses is rather pricey. ISTR 600
> cycles on Skylake, not to mention the cost of losing the TLB.  How
> common is this?

AFAIK, invalidate_other resets the asid, so it's not explicit and
perhaps cheaper?

In any case, if we were to do this, it'd be based on the sample
interval of KFENCE, which can be as low as 1ms. But this is a
production debugging feature, so the target machines are not test
machines. For those production deployments we'd be looking at every
~500ms. But I know of other deployments that use <100ms.

Doesn't sound like much, but as you say, I also worry a bit about
losing the TLB across >100 CPUs even if it's every 500ms.

Thanks,
-- Marco


Re: I915 CI-run with kfence enabled, issues found

2021-03-29 Thread Marco Elver
On Mon, 29 Mar 2021 at 23:03, Dave Hansen  wrote:
> On 3/29/21 10:45 AM, Marco Elver wrote:
> > On Mon, 29 Mar 2021 at 19:32, Dave Hansen  wrote:
> > Doing it to all CPUs is too expensive, and we can tolerate this being
> > approximate (nothing bad will happen, KFENCE might just miss a bug and
> > that's ok).
> ...
> >> BTW, the preempt checks in flush_tlb_one_kernel() are dependent on KPTI
> >> being enabled.  That's probably why you don't see this everywhere.  We
> >> should probably have unconditional preempt checks in there.
> >
> > In which case I'll add a preempt_disable/enable() pair to
> > kfence_protect_page() in arch/x86/include/asm/kfence.h.
>
> That sounds sane to me.  I'd just plead that the special situation (not
> needing deterministic TLB flushes) is obvious.  We don't want any folks
> copying this code.
>
> BTW, I know you want to avoid the cost of IPIs, but have you considered
> any other low-cost ways to get quicker TLB flushes?  For instance, you
> could loop over all CPUs and set cpu_tlbstate.invalidate_other=1.  That
> would induce a context switch at the next context switch without needing
> an IPI.

This is interesting. And it seems like it would work well for our
usecase. Ideally we should only flush entries related to the page we
changed. But it seems invalidate_other would flush the entire TLB.

With PTI, flush_tlb_one_kernel() already does that for the current
CPU, but now we'd flush entire TLBs for all CPUs and even if PTI is
off.

Do you have an intuition for how much this would affect large
multi-socket systems? I currently can't quite say, and would err on
the side of caution.

Thanks,
-- Marco


Re: [PATCH v3 06/11] perf: Add support for SIGTRAP on perf events

2021-03-29 Thread Marco Elver
On Mon, 29 Mar 2021 at 16:27, Oleg Nesterov  wrote:
> On 03/29, Peter Zijlstra wrote:
> >
> > On Thu, Mar 25, 2021 at 09:14:39AM +0100, Marco Elver wrote:
> > > @@ -6395,6 +6395,13 @@ static void perf_sigtrap(struct perf_event *event)
> > >  {
> > > struct kernel_siginfo info;
> > >
> > > +   /*
> > > +* This irq_work can race with an exiting task; bail out if sighand 
> > > has
> > > +* already been released in release_task().
> > > +*/
> > > +   if (!current->sighand)
> > > +   return;
>
> This is racy. If "current" has already passed exit_notify(), current->parent
> can do release_task() and destroy current->sighand right after the check.
>
> > Urgh.. I'm not entirely sure that check is correct, but I always forget
> > the rules with signal. It could be we ought to be testing PF_EXISTING
> > instead.
>
> Agreed, PF_EXISTING check makes more sense in any case, the exiting task
> can't receive the signal anyway.

So, per off-list discussion, it appears that I should ask to clarify:
PF_EXISTING or PF_EXITING?

It appears that PF_EXISTING is what's being suggested, whereas it has
not been mentioned anywhere, nor are its semantics clear. If it is not
simply the negation of PF_EXITING, what are its semantics? And why do
we need it in the case here (instead of something else that already
exists)?

Thanks,
-- Marco


Re: I915 CI-run with kfence enabled, issues found

2021-03-29 Thread Marco Elver
On Mon, 29 Mar 2021 at 19:32, Dave Hansen  wrote:
>
> On 3/29/21 9:40 AM, Marco Elver wrote:
> > It looks like the code path from flush_tlb_one_kernel() to
> > invalidate_user_asid()'s this_cpu_ptr() has several feature checks, so
> > probably some feature difference between systems where it triggers and
> > it doesn't.
> >
> > As far as I'm aware, there is no restriction on where
> > flush_tlb_one_kernel() is called. We could of course guard it but I
> > think that's wrong.
> >
> > Other than that, I hope the x86 maintainers know what's going on here.
> >
> > Just for reference, the stack traces in the above logs start with:
> >
> > | <3> [31.556004] BUG: using smp_processor_id() in preemptible [] 
> > code: dmesg/1075
> > | <4> [31.556070] caller is invalidate_user_asid+0x13/0x50
> > | <4> [31.556078] CPU: 6 PID: 1075 Comm: dmesg Not tainted 
> > 5.12.0-rc4-gda4a2b1a5479-kfence_1+ #1
> > | <4> [31.556081] Hardware name: Hewlett-Packard HP Pro 3500 Series/2ABF, 
> > BIOS 8.11 10/24/2012
> > | <4> [31.556084] Call Trace:
> > | <4> [31.556088]  dump_stack+0x7f/0xad
> > | <4> [31.556097]  check_preemption_disabled+0xc8/0xd0
> > | <4> [31.556104]  invalidate_user_asid+0x13/0x50
> > | <4> [31.556109]  flush_tlb_one_kernel+0x5/0x20
> > | <4> [31.556113]  kfence_protect+0x56/0x80
> > | ...
>
> Our naming here isn't great.
>
> But, the "one" in flush_tlb_one_kernel() really refers to two "ones":
> 1. Flush one single address
> 2. Flush that address from one CPU's TLB
>
> The reason preempt needs to be off is that it doesn't make any sense to
> flush one TLB entry from a "random" CPU.  It only makes sense to flush
> it when preempt is disabled and you *know* which CPU's TLB you're flushing.

Thanks for the rationale behind needing preempt off.

Though in our case it really is best-effort, as long as we hit the CPU
of the currently running task most of the time.

Doing it to all CPUs is too expensive, and we can tolerate this being
approximate (nothing bad will happen, KFENCE might just miss a bug and
that's ok).

> I think kfence needs to be using flush_tlb_kernel_range().  That does
> all the IPI fanciness to flush the TLBs on *ALL* CPUs, not just the
> current one.

The other problem is that this code can be called from interrupts.
This is already documented in arch/x86/include/asm/kfence.h

> BTW, the preempt checks in flush_tlb_one_kernel() are dependent on KPTI
> being enabled.  That's probably why you don't see this everywhere.  We
> should probably have unconditional preempt checks in there.

In which case I'll add a preempt_disable/enable() pair to
kfence_protect_page() in arch/x86/include/asm/kfence.h.

Thanks,
-- Marco


Re: I915 CI-run with kfence enabled, issues found

2021-03-29 Thread Marco Elver
[+Cc x86 maintainers]

On Mon, Mar 29, 2021 at 11:11AM +, Sarvela, Tomi P wrote:
> Hello,
> 
> I'm Tomi Sarvela, maintainer and original creator of linux i915-CI:
> https://intel-gfx-ci.01.org/
> 
> I got a hint from Martin Peres about kfence functionality in kernel, and it 
> looked
> something we'd like to enable in future CI runs so I made a trial run on 
> DRM-Tip.
> We've had regular KASAN-enabled runs, so the expectation was that there
> wouldn't be too many new problems exposed.
> 
> On this run two issues were found, where one is clearly kernel (GUC) issue,
> but another looked a lot like kfence issue on old platforms. Affected
> were IVB, SNB and ILK, with bug signature being:
> 
> <3> [31.556004] BUG: using smp_processor_id() in preemptible [] code: 
> ...
> <4> [31.556070] caller is invalidate_user_asid+0x13/0x50
> 
> I'm not a kernel developer myself, so I can't make hard assertions
> where the issue originates. In comparison to kernel without kfence,
> it looks like the newly enabled code is the cause because the
> "BUG: KFENCE" signature is missing from the trace
> 
> Can someone take a look at the traces and verify if the kfence issue
> exists and is not related to the rest of the kernel? 
> 
> If there is an issue tracker, I can add this information there.
> 
> Example traces:
> https://intel-gfx-ci.01.org/tree/drm-tip/kfence_1/fi-ivb-3770/igt@gem_ctx_cre...@basic-files.html
> 
> https://intel-gfx-ci.01.org/tree/drm-tip/kfence_1/fi-snb-2520m/igt@gem_ctx_cre...@basic-files.html
> 
> https://intel-gfx-ci.01.org/tree/drm-tip/kfence_1/fi-ilk-650/igt@gem_exec_cre...@basic.html
> 
> Kfence-exposed possible GUC issue:
> https://intel-gfx-ci.01.org/tree/drm-tip/kfence_1/fi-kbl-guc/igt@kms_addfb_ba...@addfb25-modifier-no-flag.html
> 
> All results can be seen at:
> https://intel-gfx-ci.01.org/tree/drm-tip/kfence_1/index.html
> 
> CI_DRM_9910 is recent DRM-Tip commit without -rc5 pulled in yet.
> kfence_1 is same commit with kfence defaults turned on:
[...]

It looks like the code path from flush_tlb_one_kernel() to
invalidate_user_asid()'s this_cpu_ptr() has several feature checks, so
probably some feature difference between systems where it triggers and
it doesn't.

As far as I'm aware, there is no restriction on where
flush_tlb_one_kernel() is called. We could of course guard it but I
think that's wrong.

Other than that, I hope the x86 maintainers know what's going on here.

Just for reference, the stack traces in the above logs start with:

| <3> [31.556004] BUG: using smp_processor_id() in preemptible [00000000] code: 
dmesg/1075
| <4> [31.556070] caller is invalidate_user_asid+0x13/0x50
| <4> [31.556078] CPU: 6 PID: 1075 Comm: dmesg Not tainted 
5.12.0-rc4-gda4a2b1a5479-kfence_1+ #1
| <4> [31.556081] Hardware name: Hewlett-Packard HP Pro 3500 Series/2ABF, BIOS 
8.11 10/24/2012
| <4> [31.556084] Call Trace:
| <4> [31.556088]  dump_stack+0x7f/0xad
| <4> [31.556097]  check_preemption_disabled+0xc8/0xd0
| <4> [31.556104]  invalidate_user_asid+0x13/0x50
| <4> [31.556109]  flush_tlb_one_kernel+0x5/0x20
| <4> [31.556113]  kfence_protect+0x56/0x80
|   ...

Thanks,
-- Marco


Re: [PATCH v3 06/11] perf: Add support for SIGTRAP on perf events

2021-03-29 Thread Marco Elver
On Mon, 29 Mar 2021 at 16:27, Oleg Nesterov  wrote:
> On 03/29, Peter Zijlstra wrote:
> >
> > On Thu, Mar 25, 2021 at 09:14:39AM +0100, Marco Elver wrote:
> > > @@ -6395,6 +6395,13 @@ static void perf_sigtrap(struct perf_event *event)
> > >  {
> > > struct kernel_siginfo info;
> > >
> > > +   /*
> > > +* This irq_work can race with an exiting task; bail out if sighand 
> > > has
> > > +* already been released in release_task().
> > > +*/
> > > +   if (!current->sighand)
> > > +   return;
>
> This is racy. If "current" has already passed exit_notify(), current->parent
> can do release_task() and destroy current->sighand right after the check.
>
> > Urgh.. I'm not entirely sure that check is correct, but I always forget
> > the rules with signal. It could be we ought to be testing PF_EXISTING
> > instead.
>
> Agreed, PF_EXISTING check makes more sense in any case, the exiting task
> can't receive the signal anyway.

Thanks for confirming. I'll switch to just checking PF_EXITING
(PF_EXISTING does not exist :-)).

Thanks,
-- Marco


Re: [PATCH v3 01/11] perf: Rework perf_event_exit_event()

2021-03-25 Thread Marco Elver
On Thu, Mar 25, 2021 at 05:17PM +0100, Marco Elver wrote:
[...]
> > syzkaller found a crash with stack trace pointing at changes in this
> > patch. Can't tell if this is an old issue or introduced in this series.
> 
> Yay, I found a reproducer. v5.12-rc4 is good, and sadly with this patch only 
> we
> crash. :-/
> 
> Here's a stacktrace with just this patch applied:
> 
> | BUG: kernel NULL pointer dereference, address: 07af
[...]
> | RIP: 0010:task_pid_ptr kernel/pid.c:324 [inline]
> | RIP: 0010:__task_pid_nr_ns+0x112/0x240 kernel/pid.c:500
[...]
> | Call Trace:
> |  perf_event_pid_type kernel/events/core.c:1412 [inline]
> |  perf_event_pid kernel/events/core.c:1421 [inline]
> |  perf_event_read_event+0x78/0x1d0 kernel/events/core.c:7406
> |  sync_child_event kernel/events/core.c:12404 [inline]
> |  perf_child_detach kernel/events/core.c:2223 [inline]
> |  __perf_remove_from_context+0x14d/0x280 kernel/events/core.c:2359
> |  perf_remove_from_context+0x9f/0xf0 kernel/events/core.c:2395
> |  perf_event_exit_event kernel/events/core.c:12442 [inline]
> |  perf_event_exit_task_context kernel/events/core.c:12523 [inline]
> |  perf_event_exit_task+0x276/0x4c0 kernel/events/core.c:12556
> |  do_exit+0x4cd/0xed0 kernel/exit.c:834
> |  do_group_exit+0x4d/0xf0 kernel/exit.c:922
> |  get_signal+0x1d2/0xf30 kernel/signal.c:2777
> |  arch_do_signal_or_restart+0xf7/0x750 arch/x86/kernel/signal.c:789
> |  handle_signal_work kernel/entry/common.c:147 [inline]
> |  exit_to_user_mode_loop kernel/entry/common.c:171 [inline]
> |  exit_to_user_mode_prepare+0x113/0x190 kernel/entry/common.c:208
> |  irqentry_exit_to_user_mode+0x6/0x30 kernel/entry/common.c:314
> |  asm_exc_general_protection+0x1e/0x30 arch/x86/include/asm/idtentry.h:571

I spun up gdb, and it showed me this:

| #0  perf_event_read_event (event=event@entry=0x888107cd5000, 
task=task@entry=0x)
| at kernel/events/core.c:7397
^^^ 
TASK_TOMBSTONE
| #1  0x811fc9cd in sync_child_event (child_event=0x888107cd5000) 
at kernel/events/core.c:12404
| #2  perf_child_detach (event=0x888107cd5000) at kernel/events/core.c:2223
| #3  __perf_remove_from_context (event=event@entry=0x888107cd5000, 
cpuctx=cpuctx@entry=0x88842fdf0c00,
| ctx=ctx@entry=0x8881073cb800, info=info@entry=0x3 
) at kernel/events/core.c:2359
| #4  0x811fcb9f in perf_remove_from_context 
(event=event@entry=0x888107cd5000, flags=flags@entry=3)
| at kernel/events/core.c:2395
| #5  0x81204526 in perf_event_exit_event (ctx=0x8881073cb800, 
event=0x888107cd5000)
| at kernel/events/core.c:12442
| #6  perf_event_exit_task_context (ctxn=0, child=0x88810531a200) at 
kernel/events/core.c:12523
| #7  perf_event_exit_task (child=0x88810531a200) at 
kernel/events/core.c:12556
| #8  0x8108838d in do_exit (code=code@entry=11) at kernel/exit.c:834
| #9  0x81088e4d in do_group_exit (exit_code=11) at kernel/exit.c:922

and therefore synthesized this fix on top:

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 57de8d436efd..e77294c7e654 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -12400,7 +12400,7 @@ static void sync_child_event(struct perf_event 
*child_event)
if (child_event->attr.inherit_stat) {
struct task_struct *task = child_event->ctx->task;
 
-   if (task)
+   if (task && task != TASK_TOMBSTONE)
perf_event_read_event(child_event, task);
}
 
which fixes the problem. My guess is that the parent and child are both
racing to exit?

Does that make any sense?

Thanks,
-- Marco


Re: [PATCH v3 01/11] perf: Rework perf_event_exit_event()

2021-03-25 Thread Marco Elver
On Thu, Mar 25, 2021 at 11:17AM +0100, Marco Elver wrote:
> On Wed, Mar 24, 2021 at 12:24PM +0100, Marco Elver wrote:
> > From: Peter Zijlstra 
> > 
> > Make perf_event_exit_event() more robust, such that we can use it from
> > other contexts. Specifically the up and coming remove_on_exec.
> > 
> > For this to work we need to address a few issues. Remove_on_exec will
> > not destroy the entire context, so we cannot rely on TASK_TOMBSTONE to
> > disable event_function_call() and we thus have to use
> > perf_remove_from_context().
> > 
> > When using perf_remove_from_context(), there's two races to consider.
> > The first is against close(), where we can have concurrent tear-down
> > of the event. The second is against child_list iteration, which should
> > not find a half baked event.
> > 
> > To address this, teach perf_remove_from_context() to special case
> > !ctx->is_active and about DETACH_CHILD.
> > 
> > Signed-off-by: Peter Zijlstra (Intel) 
> > Signed-off-by: Marco Elver 
> > ---
> > v3:
> > * New dependency for series:
> >   https://lkml.kernel.org/r/YFn/i3akf+toj...@hirez.programming.kicks-ass.net
> > ---
> 
> syzkaller found a crash with stack trace pointing at changes in this
> patch. Can't tell if this is an old issue or introduced in this series.

Yay, I found a reproducer. v5.12-rc4 is good, and sadly with this patch only we
crash. :-/

Here's a stacktrace with just this patch applied:

| BUG: kernel NULL pointer dereference, address: 07af
| #PF: supervisor read access in kernel mode
| #PF: error_code(0x) - not-present page
| PGD 0 P4D 0
| Oops:  [#1] PREEMPT SMP PTI
| CPU: 7 PID: 465 Comm: a.out Not tainted 5.12.0-rc4+ #25
| Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 
04/01/2014
| RIP: 0010:task_pid_ptr kernel/pid.c:324 [inline]
| RIP: 0010:__task_pid_nr_ns+0x112/0x240 kernel/pid.c:500
| Code: e8 13 55 07 00 e8 1e a6 0e 00 48 c7 c6 83 1e 0b 81 48 c7 c7 a0 2e d5 82 
e8 4b 08 04 00 44 89 e0 5b 5d 41 5c c3 e8 fe a5 0e 00 <48> 8b 85 b0 07 00 00 4a 
8d ac e0 98 01 00 00 e9 5a ff ff ff e8 e5
| RSP: :c90001b73a60 EFLAGS: 00010093
| RAX:  RBX: 82c69820 RCX: 810b1eb2
| RDX: 888108d143c0 RSI:  RDI: 8299ccc6
| RBP:  R08: 0001 R09: 
| R10: 888108d14db8 R11:  R12: 0001
| R13:  R14:  R15: 888108e05240
| FS:  () GS:88842fdc() knlGS:
| CS:  0010 DS:  ES:  CR0: 80050033
| CR2: 07af CR3: 02c22002 CR4: 00770ee0
| DR0:  DR1:  DR2: 
| DR3:  DR6: fffe0ff0 DR7: 0400
| PKRU: 5554
| Call Trace:
|  perf_event_pid_type kernel/events/core.c:1412 [inline]
|  perf_event_pid kernel/events/core.c:1421 [inline]
|  perf_event_read_event+0x78/0x1d0 kernel/events/core.c:7406
|  sync_child_event kernel/events/core.c:12404 [inline]
|  perf_child_detach kernel/events/core.c:2223 [inline]
|  __perf_remove_from_context+0x14d/0x280 kernel/events/core.c:2359
|  perf_remove_from_context+0x9f/0xf0 kernel/events/core.c:2395
|  perf_event_exit_event kernel/events/core.c:12442 [inline]
|  perf_event_exit_task_context kernel/events/core.c:12523 [inline]
|  perf_event_exit_task+0x276/0x4c0 kernel/events/core.c:12556
|  do_exit+0x4cd/0xed0 kernel/exit.c:834
|  do_group_exit+0x4d/0xf0 kernel/exit.c:922
|  get_signal+0x1d2/0xf30 kernel/signal.c:2777
|  arch_do_signal_or_restart+0xf7/0x750 arch/x86/kernel/signal.c:789
|  handle_signal_work kernel/entry/common.c:147 [inline]
|  exit_to_user_mode_loop kernel/entry/common.c:171 [inline]
|  exit_to_user_mode_prepare+0x113/0x190 kernel/entry/common.c:208
|  irqentry_exit_to_user_mode+0x6/0x30 kernel/entry/common.c:314
|  asm_exc_general_protection+0x1e/0x30 arch/x86/include/asm/idtentry.h:571

Attached is a C reproducer of the syzkaller program that crashes us.

Thanks,
-- Marco
// autogenerated by syzkaller (https://github.com/google/syzkaller)
/*
Generated from this syzkaller program:

clone(0x88004400, 0x0, 0x0, 0x0, 0x0)
perf_event_open(&(0x7f0003c0)={0x4, 0x70, 0x40, 0x1, 0x3, 0x1, 0x0, 0x6, 0x10001, 0x0, 0x0, 0x1, 0x0, 0x1, 0x1, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x1, 0x1, 0x1, 0x0, 0x1, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x8001, 0x2, @perf_bp={&(0x7f000380), 0xd}, 0x1000, 0x6, 0x0, 0x4, 0x1, 0x4, 0x8}, 0x0, 0x, 0x, 0x1)
clone(0x8000, &(0x7f000200)="3017248985480229c715f01f2776139977f49770d8181077dce816423a929ed5e59bf26ca77f2ba311b783dda29870d621ff2394424d9c799be5fa29f1ee42102645b56fd9727401d

Re: [PATCH v3 07/11] perf: Add breakpoint information to siginfo on SIGTRAP

2021-03-25 Thread Marco Elver
On Thu, 25 Mar 2021 at 15:18, Ingo Molnar  wrote:
>
> * Dmitry Vyukov  wrote:
>
> > On Wed, Mar 24, 2021 at 3:05 PM Marco Elver  wrote:
> > >
> > > On Wed, 24 Mar 2021 at 15:01, Peter Zijlstra  wrote:
> > > >
> > > > One last try, I'll leave it alone now, I promise :-)
> > >
> > > This looks like it does what you suggested, thanks! :-)
> > >
> > > I'll still need to think about it, because of the potential problem
> > > with modify-signal-races and what the user's synchronization story
> > > would look like then.
> >
> > I agree that this looks inherently racy. The attr can't be allocated
> > on stack, user synchronization may be tricky and expensive. The API
> > may provoke bugs and some users may not even realize the race problem.
>
> Yeah, so why cannot we allocate enough space from the signal handler
> user-space stack and put the attr there, and point to it from
> sig_info?
>
> The idea would be to create a stable, per-signal snapshot of whatever
> the perf_attr state is at the moment the event happens and the signal
> is generated - which is roughly what user-space wants, right?

I certainly couldn't say how feasible this is. Is there infrastructure
in place to do this? Or do we have to introduce support for stashing
things on the signal stack?

From what we can tell, the most flexible option though appears to be
just some user settable opaque data in perf_event_attr, that is copied
to siginfo. It'd allow user space to store a pointer or a hash/key, or
just encode the relevant information it wants; but could also go
further, and add information beyond perf_event_attr, such as things
like a signal receiver filter (e.g. task ID or set of threads which
should process the signal etc.).

So if there's no strong objection to the additional field in
perf_event_attr, I think it'll give us the simplest and most flexible
option.

Thanks,
-- Marco

> Thanks,
>
> Ingo


Re: [PATCH v3 01/11] perf: Rework perf_event_exit_event()

2021-03-25 Thread Marco Elver
On Wed, Mar 24, 2021 at 12:24PM +0100, Marco Elver wrote:
> From: Peter Zijlstra 
> 
> Make perf_event_exit_event() more robust, such that we can use it from
> other contexts. Specifically the up and coming remove_on_exec.
> 
> For this to work we need to address a few issues. Remove_on_exec will
> not destroy the entire context, so we cannot rely on TASK_TOMBSTONE to
> disable event_function_call() and we thus have to use
> perf_remove_from_context().
> 
> When using perf_remove_from_context(), there's two races to consider.
> The first is against close(), where we can have concurrent tear-down
> of the event. The second is against child_list iteration, which should
> not find a half baked event.
> 
> To address this, teach perf_remove_from_context() to special case
> !ctx->is_active and about DETACH_CHILD.
> 
> Signed-off-by: Peter Zijlstra (Intel) 
> Signed-off-by: Marco Elver 
> ---
> v3:
> * New dependency for series:
>   https://lkml.kernel.org/r/YFn/i3akf+toj...@hirez.programming.kicks-ass.net
> ---

syzkaller found a crash with stack trace pointing at changes in this
patch. Can't tell if this is an old issue or introduced in this series.

It looks like task_pid_ptr() wants to access task_struct::signal, but
the task_struct pointer is NULL.

Any ideas?

general protection fault, probably for non-canonical address 
0xdc000103:  [#1] PREEMPT SMP KASAN
KASAN: null-ptr-deref in range [0x0818-0x081f]
CPU: 2 PID: 15084 Comm: syz-executor.1 Not tainted 5.12.0-rc4+ #5
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
RIP: 0010:task_pid_ptr kernel/pid.c:325 [inline]
RIP: 0010:__task_pid_nr_ns+0x137/0x3e0 kernel/pid.c:500
Code: 8b 75 00 eb 08 e8 59 28 29 00 45 31 f6 31 ff 44 89 fe e8 5c 2c 29 00 45 
85 ff 74 49 48 81 c3 20 08 00 00 48 89 d8 48 c1 e8 03 <42> 80 3c 20 00 74 08 48 
89 df e8 aa 03 6d 00 48 8b 2b 44 89 fb bf
RSP: 0018:c9000c76f6d0 EFLAGS: 00010007
RAX: 0103 RBX: 081f RCX: 8880717d8000
RDX: 8880717d8000 RSI: 0001 RDI: 
RBP: 0001 R08: 814fe814 R09: fbfff1f296b1
R10: fbfff1f296b1 R11:  R12: dc00
R13: 11100e6dfc5c R14: 888057fba108 R15: 0001
FS:  () GS:88802cf0() knlGS:
CS:  0010 DS:  ES:  CR0: 80050033
CR2: 7ffcc3b05bc0 CR3: 40ac CR4: 00750ee0
DR0:  DR1:  DR2: 
DR3:  DR6: fffe0ff0 DR7: 0600
PKRU: 5554
Call Trace:
 perf_event_pid_type kernel/events/core.c:1412 [inline]
 perf_event_pid kernel/events/core.c:1421 [inline]
 perf_event_read_event kernel/events/core.c:7511 [inline]
 sync_child_event kernel/events/core.c:12521 [inline]
 perf_child_detach kernel/events/core.c:2223 [inline]
 __perf_remove_from_context+0x569/0xd30 kernel/events/core.c:2359
 perf_remove_from_context+0x19d/0x220 kernel/events/core.c:2395
 perf_event_exit_event+0x76/0x950 kernel/events/core.c:12559
 perf_event_exit_task_context kernel/events/core.c:12640 [inline]
 perf_event_exit_task+0x715/0xa40 kernel/events/core.c:12673
 do_exit+0x6c2/0x2290 kernel/exit.c:834
 do_group_exit+0x168/0x2d0 kernel/exit.c:922
 get_signal+0x1734/0x1ef0 kernel/signal.c:2779
 arch_do_signal_or_restart+0x41/0x620 arch/x86/kernel/signal.c:789
 handle_signal_work kernel/entry/common.c:147 [inline]
 exit_to_user_mode_loop kernel/entry/common.c:171 [inline]
 exit_to_user_mode_prepare+0xac/0x1e0 kernel/entry/common.c:208
 irqentry_exit_to_user_mode+0x6/0x40 kernel/entry/common.c:314
 exc_general_protection+0x222/0x370 arch/x86/kernel/traps.c:530
 asm_exc_general_protection+0x1e/0x30 arch/x86/include/asm/idtentry.h:571


Re: [PATCH v3 06/11] perf: Add support for SIGTRAP on perf events

2021-03-25 Thread Marco Elver
On Wed, Mar 24, 2021 at 12:24PM +0100, Marco Elver wrote:
[...]
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index b6434697c516..1e4c949bf75f 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -6391,6 +6391,17 @@ void perf_event_wakeup(struct perf_event *event)
>   }
>  }
>  
> +static void perf_sigtrap(struct perf_event *event)
> +{
> + struct kernel_siginfo info;
> +

I think we need to add something like this here:

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4b82788fbaab..4fcd6b45ce66 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6395,6 +6395,13 @@ static void perf_sigtrap(struct perf_event *event)
 {
struct kernel_siginfo info;
 
+   /*
+* This irq_work can race with an exiting task; bail out if sighand has
+* already been released in release_task().
+*/
+   if (!current->sighand)
+   return;
+
clear_siginfo(&info);
info.si_signo = SIGTRAP;
info.si_code = TRAP_PERF;


Because syzkaller was able to produce this:

| general protection fault, probably for non-canonical address 
0xdc03:  [#1] PREEMPT SMP KASAN
| KASAN: null-ptr-deref in range [0x0018-0x001f]
| CPU: 0 PID: 28393 Comm: kworker/u9:4 Not tainted 5.12.0-rc4+ #5
| Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 
04/01/2014
| RIP: 0010:__lock_acquire+0x87/0x5e60 kernel/locking/lockdep.c:4770
| Code: 84 c0 48 89 7c 24 78 0f 85 10 26 00 00 83 3d 53 64 59 0c 00 0f 84 84 41 
00 00 83 3d 72 8a 01 0b 00 74 32 48 89 f8 48 c1 e8 03 <80> 3c 30 00 74 19 48 8b 
7c 24 78 e8 79 8b 60 00 48 8b 7c 24 78 48
| RSP: 0018:c9007c00 EFLAGS: 00010006
| RAX: 0003 RBX: 888048058000 RCX: 
| RDX:  RSI: dc00 RDI: 0018
| RBP: c9007da8 R08: 0001 R09: 0001
| R10: fbfff1b6b27e R11:  R12: 0001
| R13:  R14:  R15: 0001
| FS:  () GS:88802ce0() knlGS:
| CS:  0010 DS:  ES:  CR0: 80050033
| CR2: 00970004 CR3: 40d91000 CR4: 00750ef0
| DR0:  DR1:  DR2: 
| DR3:  DR6: 0ff0 DR7: 0600
| PKRU: 5554
| Call Trace:
|  
|  lock_acquire+0x126/0x650 kernel/locking/lockdep.c:5510
|  __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
|  _raw_spin_lock_irqsave+0x73/0xa0 kernel/locking/spinlock.c:159
|  force_sig_info_to_task+0x65/0x3f0 kernel/signal.c:1322
|  perf_sigtrap kernel/events/core.c:6418 [inline]
|  perf_pending_event_disable kernel/events/core.c:6433 [inline]
|  perf_pending_event+0x46f/0x620 kernel/events/core.c:6475
|  irq_work_single kernel/irq_work.c:153 [inline]
|  irq_work_run_list kernel/irq_work.c:175 [inline]
|  irq_work_run+0x1da/0x640 kernel/irq_work.c:184
|  __sysvec_irq_work+0x62/0x70 arch/x86/kernel/irq_work.c:22
|  sysvec_irq_work+0x8c/0xb0 arch/x86/kernel/irq_work.c:17
|  
|  asm_sysvec_irq_work+0x12/0x20 arch/x86/include/asm/idtentry.h:658
| RIP: 0010:__raw_write_unlock_irq include/linux/rwlock_api_smp.h:268 [inline]
| RIP: 0010:_raw_write_unlock_irq+0x25/0x40 kernel/locking/spinlock.c:343
| Code: aa fd ff 66 90 53 48 89 fb 48 83 c7 18 48 8b 74 24 08 e8 3e 34 04 f8 48 
89 df e8 a6 1a 06 f8 e8 21 85 26 f8 fb bf 01 00 00 00  56 19 fa f7 65 8b 05 
77 65 a9 76 85 c0 74 02 5b c3 e8 2b c1 a7
| RSP: 0018:c9000202fd68 EFLAGS: 0286
| RAX: 2a7870700b93e400 RBX: 8c40a040 RCX: 8ff9cb03
| RDX:  RSI: 0001 RDI: 0001
| RBP: 888047b24790 R08: 817f0f50 R09: fbfff1b6b27e
| R10: fbfff1b6b27e R11:  R12: 888048058000
| R13: dc00 R14: 888047b24701 R15: 888048058000
|  release_task+0x10bf/0x1360 kernel/exit.c:220
|  exit_notify kernel/exit.c:699 [inline]
|  do_exit+0x19b0/0x2290 kernel/exit.c:845
|  call_usermodehelper_exec_async+0x39c/0x3a0 kernel/umh.c:123
|  ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:294


> + clear_siginfo(&info);
> + info.si_signo = SIGTRAP;
> + info.si_code = TRAP_PERF;
> + info.si_errno = event->attr.type;
> + force_sig_info(&info);
> +}
> +
>  static void perf_pending_event_disable(struct perf_event *event)
>  {
>   int cpu = READ_ONCE(event->pending_disable);
> @@ -6400,6 +6411,13 @@ static void perf_pending_event_disable(struct 
> perf_event *event)
>  
>   if (cpu == smp_processor_id()) {
>   WRITE_ONCE(event->pending_disable, -1);
> +
> + if (event->attr.sigtrap) {
> + atomic_set(&event->event_limit, 1); /* rearm event */
> + perf_sigtrap(event);
> + return;
> + }
> +
>   perf_event_disable_local(event);
>   return;
>   }
[...] 


Re: [PATCH v3 07/11] perf: Add breakpoint information to siginfo on SIGTRAP

2021-03-25 Thread Marco Elver
On Wed, 24 Mar 2021 at 15:15, Dmitry Vyukov  wrote:
> On Wed, Mar 24, 2021 at 3:12 PM Dmitry Vyukov  wrote:
> > > On Wed, 24 Mar 2021 at 15:01, Peter Zijlstra  wrote:
> > > >
> > > > One last try, I'll leave it alone now, I promise :-)
> > >
> > > This looks like it does what you suggested, thanks! :-)
> > >
> > > I'll still need to think about it, because of the potential problem
> > > with modify-signal-races and what the user's synchronization story
> > > would look like then.
> >
> > I agree that this looks inherently racy. The attr can't be allocated
> > on stack, user synchronization may be tricky and expensive. The API
> > may provoke bugs and some users may not even realize the race problem.
> >
> > One potential alternative is use of an opaque u64 context (if we could
> > shove it into the attr). A user can pass a pointer to the attr in
> > there (makes it equivalent to this proposal), or bit-pack size/type
> > (as we want), pass some sequence number or whatever.
>
> Just to clarify what I was thinking about, but did not really state:
> perf_event_attr_t includes u64 ctx, and we return it back to the user
> in siginfo_t. Kernel does not treat it in any way. This is a pretty
> common API pattern in general.

Ok, let's go for a new field in perf_event_attr which is copied to
si_perf. This gives user space full flexibility to decide what to
stick in it, and the kernel does not prescribe some weird encoding or
synchronization that user space would have to live with. I'll probably
call it perf_event_attr::sig_data, because all si_* things are macros.

Thanks,
-- Marco


Re: [PATCH] kernel: kcov: fix a typo in comment

2021-03-24 Thread Marco Elver
On Thu, 25 Mar 2021 at 00:04, Andrew Morton  wrote:
> On Tue, 23 Mar 2021 23:32:57 +0100 Marco Elver  wrote:
> > On Tue, 23 Mar 2021 at 07:45, 'Dmitry Vyukov' via kasan-dev
> >  wrote:
> > > On Tue, Mar 23, 2021 at 7:24 AM tl455047  wrote:
> > > >
> > > > Fixed a typo in comment.
> > > >
> > > > Signed-off-by: tl455047 
> > >
> > > Reviewed-by: Dmitry Vyukov 
> > >
> > > +Andrew, linux-mm as KCOV patches are generally merged into mm.
> > >
> > > Thanks for the fix
> >
> > FYI, I believe this code may not be accepted due to this:
> >
> > "[...] It is imperative that all code contributed to the kernel be 
> > legitimately
> > free software.  For that reason, code from anonymous (or pseudonymous)
> > contributors will not be accepted."
> >
> > See Documentation/process/1.Intro.rst
>
> Correct.  I let this one pass because the patch is so minor.  But yes,
> a real name would be preferred, please.

I've just seen that the author sent
https://lkml.kernel.org/r/20210324071051.55229-1-tl445047...@gmail.com


Re: [PATCH v3 07/11] perf: Add breakpoint information to siginfo on SIGTRAP

2021-03-24 Thread Marco Elver
On Wed, 24 Mar 2021 at 15:01, Peter Zijlstra  wrote:
>
> One last try, I'll leave it alone now, I promise :-)

This looks like it does what you suggested, thanks! :-)

I'll still need to think about it, because of the potential problem
with modify-signal-races and what the user's synchronization story
would look like then.

> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -778,6 +778,9 @@ struct perf_event {
> void *security;
>  #endif
> struct list_headsb_list;
> +
> +   unsigned long   si_uattr;
> +   unsigned long   si_data;
>  #endif /* CONFIG_PERF_EVENTS */
>  };
>
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -5652,13 +5652,17 @@ static long _perf_ioctl(struct perf_even
> return perf_event_query_prog_array(event, (void __user *)arg);
>
> case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: {
> +   struct perf_event_attr __user *uattr;
> struct perf_event_attr new_attr;
> -   int err = perf_copy_attr((struct perf_event_attr __user *)arg,
> -&new_attr);
> +   int err;
>
> +   uattr = (struct perf_event_attr __user *)arg;
> +   err = perf_copy_attr(uattr, &new_attr);
> if (err)
> return err;
>
> +   event->si_uattr = (unsigned long)uattr;
> +
> return perf_event_modify_attr(event,  _attr);
> }
> default:
> @@ -6399,7 +6403,12 @@ static void perf_sigtrap(struct perf_eve
> clear_siginfo(&info);
> info.si_signo = SIGTRAP;
> info.si_code = TRAP_PERF;
> -   info.si_errno = event->attr.type;
> +   info.si_addr = (void *)event->si_data;
> +
> +   info.si_perf = event->si_uattr;
> +   if (event->parent)
> +   info.si_perf = event->parent->si_uattr;
> +
> force_sig_info(&info);
>  }
>
> @@ -6414,8 +6423,8 @@ static void perf_pending_event_disable(s
> WRITE_ONCE(event->pending_disable, -1);
>
> if (event->attr.sigtrap) {
> -   atomic_set(&event->event_limit, 1); /* rearm event */
> perf_sigtrap(event);
> +   atomic_set_release(&event->event_limit, 1); /* rearm 
> event */
> return;
> }
>
> @@ -9121,6 +9130,7 @@ static int __perf_event_overflow(struct
> if (events && atomic_dec_and_test(&event->event_limit)) {
> ret = 1;
> event->pending_kill = POLL_HUP;
> +   event->si_data = data->addr;
>
> perf_event_disable_inatomic(event);
> }
> @@ -12011,6 +12021,8 @@ SYSCALL_DEFINE5(perf_event_open,
> goto err_task;
> }
>
> +   event->si_uattr = (unsigned long)attr_uptr;
> +
> if (is_sampling_event(event)) {
> if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
> err = -EOPNOTSUPP;


Re: [PATCH v3 07/11] perf: Add breakpoint information to siginfo on SIGTRAP

2021-03-24 Thread Marco Elver
On Wed, 24 Mar 2021 at 14:21, Peter Zijlstra  wrote:
>
> On Wed, Mar 24, 2021 at 02:01:56PM +0100, Peter Zijlstra wrote:
> > On Wed, Mar 24, 2021 at 01:53:48PM +0100, Peter Zijlstra wrote:
> > > On Wed, Mar 24, 2021 at 12:24:59PM +0100, Marco Elver wrote:
> > > > Encode information from breakpoint attributes into siginfo_t, which
> > > > helps disambiguate which breakpoint fired.
> > > >
> > > > Note, providing the event fd may be unreliable, since the event may have
> > > > been modified (via PERF_EVENT_IOC_MODIFY_ATTRIBUTES) between the event
> > > > triggering and the signal being delivered to user space.
> > > >
> > > > Signed-off-by: Marco Elver 
> > > > ---
> > > > v2:
> > > > * Add comment about si_perf==0.
> > > > ---
> > > >  kernel/events/core.c | 16 
> > > >  1 file changed, 16 insertions(+)
> > > >
> > > > diff --git a/kernel/events/core.c b/kernel/events/core.c
> > > > index 1e4c949bf75f..0316d39e8c8f 100644
> > > > --- a/kernel/events/core.c
> > > > +++ b/kernel/events/core.c
> > > > @@ -6399,6 +6399,22 @@ static void perf_sigtrap(struct perf_event 
> > > > *event)
> > > >   info.si_signo = SIGTRAP;
> > > >   info.si_code = TRAP_PERF;
> > > >   info.si_errno = event->attr.type;
> > > > +
> > > > + switch (event->attr.type) {
> > > > + case PERF_TYPE_BREAKPOINT:
> > > > + info.si_addr = (void *)(unsigned long)event->attr.bp_addr;
> > > > + info.si_perf = (event->attr.bp_len << 16) | 
> > > > (u64)event->attr.bp_type;
> > >
> > > Ahh, here's the si_perf user. I wasn't really clear to me what was
> > > supposed to be in that field at patch #5 where it was introduced.
> > >
> > > Would it perhaps make sense to put the user address of struct
> > > perf_event_attr in there instead? (Obviously we'd have to carry it from
> > > the syscall to here, but it might be more useful than a random encoding
> > > of some bits therefrom).
> > >
> > > Then we can also clearly document that's in that field, and it might be
> > > more useful for possible other uses.
> >
> > Something like so...
>
> Ok possibly something like so, which also gets the data address right
> for more cases.

It'd be nice if this could work. Though I think there's an inherent
problem (same as with fd) with trying to pass a reference back to the
user, while the user can concurrently modify that reference.

Let's assume that user space creates new copies of perf_event_attr for
every version they want, there's still a race where the user modifies
an event, and concurrently in another thread a signal arrives. I
currently don't see a way to determine when it's safe to free a
perf_event_attr or reuse, without there still being a chance that a
signal arrives due to some old perf_event_attr. And for our usecase,
we really need to know a precise subset out of attr that triggered the
event.

So the safest thing I can see is to stash a copy of the relevant
information in siginfo, which is how we ended up with encoding bits
from perf_event_attr into si_perf.

One way around this I could see is that we know that there's a limited
number of combinations of attrs, and the user just creates an instance
for every version they want (and hope it doesn't exceed some large
number). Of course, for breakpoints, we have bp_addr, but let's assume
that si_addr has the right version, so we won't need to access
perf_event_attr::bp_addr.

But given the additional complexities, I'm not sure it's worth it. Is
there a way to solve the modify-signal-race problem in a nicer way?

Thanks,
-- Marco


[PATCH v3 10/11] tools headers uapi: Sync tools/include/uapi/linux/perf_event.h

2021-03-24 Thread Marco Elver
Sync tool's uapi to pick up the changes adding inherit_thread,
remove_on_exec, and sigtrap fields to perf_event_attr.

Signed-off-by: Marco Elver 
---
v3:
* Added to series.
---
 tools/include/uapi/linux/perf_event.h | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tools/include/uapi/linux/perf_event.h 
b/tools/include/uapi/linux/perf_event.h
index ad15e40d7f5d..3a4dbb1688f0 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -389,7 +389,10 @@ struct perf_event_attr {
cgroup :  1, /* include cgroup events */
text_poke  :  1, /* include text poke 
events */
build_id   :  1, /* use build id in mmap2 
events */
-   __reserved_1   : 29;
+   inherit_thread :  1, /* children only inherit 
if cloned with CLONE_THREAD */
+   remove_on_exec :  1, /* event is removed from 
task on exec */
+   sigtrap:  1, /* send synchronous 
SIGTRAP on event */
+   __reserved_1   : 26;
 
union {
__u32   wakeup_events;/* wakeup every n events */
-- 
2.31.0.291.g576ba9dcdaf-goog



[PATCH v3 09/11] selftests/perf_events: Add kselftest for remove_on_exec

2021-03-24 Thread Marco Elver
Add kselftest to test that remove_on_exec removes inherited events from
child tasks.

Signed-off-by: Marco Elver 
---
v3:
* Fix for latest libc signal.h.

v2:
* Add patch to series.
---
 .../testing/selftests/perf_events/.gitignore  |   1 +
 tools/testing/selftests/perf_events/Makefile  |   2 +-
 .../selftests/perf_events/remove_on_exec.c| 260 ++
 3 files changed, 262 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/perf_events/remove_on_exec.c

diff --git a/tools/testing/selftests/perf_events/.gitignore 
b/tools/testing/selftests/perf_events/.gitignore
index 4dc43e1bd79c..790c47001e77 100644
--- a/tools/testing/selftests/perf_events/.gitignore
+++ b/tools/testing/selftests/perf_events/.gitignore
@@ -1,2 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0-only
 sigtrap_threads
+remove_on_exec
diff --git a/tools/testing/selftests/perf_events/Makefile 
b/tools/testing/selftests/perf_events/Makefile
index 973a2c39ca83..fcafa5f0d34c 100644
--- a/tools/testing/selftests/perf_events/Makefile
+++ b/tools/testing/selftests/perf_events/Makefile
@@ -2,5 +2,5 @@
 CFLAGS += -Wl,-no-as-needed -Wall -I../../../../usr/include
 LDFLAGS += -lpthread
 
-TEST_GEN_PROGS := sigtrap_threads
+TEST_GEN_PROGS := sigtrap_threads remove_on_exec
 include ../lib.mk
diff --git a/tools/testing/selftests/perf_events/remove_on_exec.c 
b/tools/testing/selftests/perf_events/remove_on_exec.c
new file mode 100644
index ..5814611a1dc7
--- /dev/null
+++ b/tools/testing/selftests/perf_events/remove_on_exec.c
@@ -0,0 +1,260 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test for remove_on_exec.
+ *
+ * Copyright (C) 2021, Google LLC.
+ */
+
+#define _GNU_SOURCE
+
+/* We need the latest siginfo from the kernel repo. */
+#include 
+#include 
+#define __have_siginfo_t 1
+#define __have_sigval_t 1
+#define __have_sigevent_t 1
+#define __siginfo_t_defined
+#define __sigval_t_defined
+#define __sigevent_t_defined
+#define _BITS_SIGINFO_CONSTS_H 1
+#define _BITS_SIGEVENT_CONSTS_H 1
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "../kselftest_harness.h"
+
+static volatile int signal_count;
+
+static struct perf_event_attr make_event_attr(void)
+{
+   struct perf_event_attr attr = {
+   .type   = PERF_TYPE_HARDWARE,
+   .size   = sizeof(attr),
+   .config = PERF_COUNT_HW_INSTRUCTIONS,
+   .sample_period  = 1000,
+   .exclude_kernel = 1,
+   .exclude_hv = 1,
+   .disabled   = 1,
+   .inherit= 1,
+   /*
+* Children normally retain their inherited event on exec; with
+* remove_on_exec, we'll remove their event, but the parent and
+* any other non-exec'd children will keep their events.
+*/
+   .remove_on_exec = 1,
+   .sigtrap= 1,
+   };
+   return attr;
+}
+
+static void sigtrap_handler(int signum, siginfo_t *info, void *ucontext)
+{
+   if (info->si_code != TRAP_PERF) {
+   fprintf(stderr, "%s: unexpected si_code %d\n", __func__, 
info->si_code);
+   return;
+   }
+
+   signal_count++;
+}
+
+FIXTURE(remove_on_exec)
+{
+   struct sigaction oldact;
+   int fd;
+};
+
+FIXTURE_SETUP(remove_on_exec)
+{
+   struct perf_event_attr attr = make_event_attr();
+   struct sigaction action = {};
+
+   signal_count = 0;
+
+   /* Initialize sigtrap handler. */
+   action.sa_flags = SA_SIGINFO | SA_NODEFER;
+   action.sa_sigaction = sigtrap_handler;
+   sigemptyset(&action.sa_mask);
+   ASSERT_EQ(sigaction(SIGTRAP, &action, &self->oldact), 0);
+
+   /* Initialize perf event. */
+   self->fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 
PERF_FLAG_FD_CLOEXEC);
+   ASSERT_NE(self->fd, -1);
+}
+
+FIXTURE_TEARDOWN(remove_on_exec)
+{
+   close(self->fd);
+   sigaction(SIGTRAP, &self->oldact, NULL);
+}
+
+/* Verify event propagates to fork'd child. */
+TEST_F(remove_on_exec, fork_only)
+{
+   int status;
+   pid_t pid = fork();
+
+   if (pid == 0) {
+   ASSERT_EQ(signal_count, 0);
+   ASSERT_EQ(ioctl(self->fd, PERF_EVENT_IOC_ENABLE, 0), 0);
+   while (!signal_count);
+   _exit(42);
+   }
+
+   while (!signal_count); /* Child enables event. */
+   EXPECT_EQ(waitpid(pid, &status, 0), pid);
+   EXPECT_EQ(WEXITSTATUS(status), 42);
+}
+
+/*
+ * Verify that event does _not_ propagate to fork+exec'd child; event enabled
+ * after fork+exec.
+ */
+TEST_F(remove_on_exec, fork_exec_then_enable)
+{
+   pid_t pid_exec, pid_only_fork;
+   int pipefd[2];
+   int tmp;
+
+   /*
+* Non-exec child, to ensure exec does not affect inherited events of
+* other children.
+*/
+   pid_only_f

[PATCH v3 11/11] perf test: Add basic stress test for sigtrap handling

2021-03-24 Thread Marco Elver
Add basic stress test for sigtrap handling as a perf tool built-in test.
This allows sanity checking the basic sigtrap functionality from within
the perf tool.

Note: A more elaborate kselftest version of this test can also be found
in tools/testing/selftests/perf_events/sigtrap_threads.c.

Signed-off-by: Marco Elver 
---
v3:
* Added to series (per suggestion from Ian Rogers).
---
 tools/perf/tests/Build  |   1 +
 tools/perf/tests/builtin-test.c |   5 ++
 tools/perf/tests/sigtrap.c  | 148 
 tools/perf/tests/tests.h|   1 +
 4 files changed, 155 insertions(+)
 create mode 100644 tools/perf/tests/sigtrap.c

diff --git a/tools/perf/tests/Build b/tools/perf/tests/Build
index 650aec19d490..a429c7a02b37 100644
--- a/tools/perf/tests/Build
+++ b/tools/perf/tests/Build
@@ -64,6 +64,7 @@ perf-y += parse-metric.o
 perf-y += pe-file-parsing.o
 perf-y += expand-cgroup.o
 perf-y += perf-time-to-tsc.o
+perf-y += sigtrap.o
 
 $(OUTPUT)tests/llvm-src-base.c: tests/bpf-script-example.c tests/Build
$(call rule_mkdir)
diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c
index c4b888f18e9c..28a1cb5eaa77 100644
--- a/tools/perf/tests/builtin-test.c
+++ b/tools/perf/tests/builtin-test.c
@@ -359,6 +359,11 @@ static struct test generic_tests[] = {
.func = test__perf_time_to_tsc,
.is_supported = test__tsc_is_supported,
},
+   {
+   .desc = "Sigtrap support",
+   .func = test__sigtrap,
+   .is_supported = test__wp_is_supported, /* uses wp for test */
+   },
{
.func = NULL,
},
diff --git a/tools/perf/tests/sigtrap.c b/tools/perf/tests/sigtrap.c
new file mode 100644
index ..b3f4006c22fd
--- /dev/null
+++ b/tools/perf/tests/sigtrap.c
@@ -0,0 +1,148 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Basic test for sigtrap support.
+ *
+ * Copyright (C) 2021, Google LLC.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "cloexec.h"
+#include "debug.h"
+#include "event.h"
+#include "tests.h"
+#include "../perf-sys.h"
+
+#define NUM_THREADS 5
+
+static struct {
+   int tids_want_signal;   /* Which threads still want a signal. */
+   int signal_count;   /* Sanity check number of signals 
received. */
+   volatile int iterate_on;/* Variable to set breakpoint on. */
+   siginfo_t first_siginfo;/* First observed siginfo_t. */
+} ctx;
+
+static struct perf_event_attr make_event_attr(void)
+{
+   struct perf_event_attr attr = {
+   .type   = PERF_TYPE_BREAKPOINT,
+   .size   = sizeof(attr),
+   .sample_period  = 1,
+   .disabled   = 1,
+   .bp_addr= (long)&ctx.iterate_on,
+   .bp_type= HW_BREAKPOINT_RW,
+   .bp_len = HW_BREAKPOINT_LEN_1,
+   .inherit= 1, /* Children inherit events ... */
+   .inherit_thread = 1, /* ... but only cloned with CLONE_THREAD. 
*/
+   .remove_on_exec = 1, /* Required by sigtrap. */
+   .sigtrap= 1, /* Request synchronous SIGTRAP on event. */
+   };
+   return attr;
+}
+
+static void
+sigtrap_handler(int signum __maybe_unused, siginfo_t *info, void *ucontext 
__maybe_unused)
+{
+   if (!__atomic_fetch_add(&ctx.signal_count, 1, __ATOMIC_RELAXED))
+   ctx.first_siginfo = *info;
+   __atomic_fetch_sub(&ctx.tids_want_signal, syscall(SYS_gettid), 
__ATOMIC_RELAXED);
+}
+
+static void *test_thread(void *arg)
+{
+   pthread_barrier_t *barrier = (pthread_barrier_t *)arg;
+   pid_t tid = syscall(SYS_gettid);
+   int i;
+
+   pthread_barrier_wait(barrier);
+
+   __atomic_fetch_add(&ctx.tids_want_signal, tid, __ATOMIC_RELAXED);
+   for (i = 0; i < ctx.iterate_on - 1; i++)
+   __atomic_fetch_add(&ctx.tids_want_signal, tid, 
__ATOMIC_RELAXED);
+
+   return NULL;
+}
+
+static int run_test_threads(pthread_t *threads, pthread_barrier_t *barrier)
+{
+   int i;
+
+   pthread_barrier_wait(barrier);
+   for (i = 0; i < NUM_THREADS; i++)
+   TEST_ASSERT_EQUAL("pthread_join() failed", 
pthread_join(threads[i], NULL), 0);
+
+   return TEST_OK;
+}
+
+static int run_stress_test(int fd, pthread_t *threads, pthread_barrier_t 
*barrier)
+{
+   int ret;
+
+   ctx.iterate_on = 3000;
+
+   TEST_ASSERT_EQUAL("misfired signal?", ctx.signal_count, 0);
+   TEST_ASSERT_EQUAL("enable failed", ioctl(fd, PERF_EVENT_IOC_ENABLE, 0), 
0);
+   ret = run_test_threads(threads, barrier);
+   TEST_ASSERT_EQUAL("disable failed", ioctl(fd, PERF_EVENT_IOC_DISABLE, 
0), 0);
+
+   TEST_ASSERT_EQUAL("unexpected sigtraps", ctx.signal_count, NUM_THREADS 
* ctx.iterate_on);
+   

[PATCH v3 07/11] perf: Add breakpoint information to siginfo on SIGTRAP

2021-03-24 Thread Marco Elver
Encode information from breakpoint attributes into siginfo_t, which
helps disambiguate which breakpoint fired.

Note, providing the event fd may be unreliable, since the event may have
been modified (via PERF_EVENT_IOC_MODIFY_ATTRIBUTES) between the event
triggering and the signal being delivered to user space.

Signed-off-by: Marco Elver 
---
v2:
* Add comment about si_perf==0.
---
 kernel/events/core.c | 16 
 1 file changed, 16 insertions(+)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1e4c949bf75f..0316d39e8c8f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6399,6 +6399,22 @@ static void perf_sigtrap(struct perf_event *event)
info.si_signo = SIGTRAP;
info.si_code = TRAP_PERF;
info.si_errno = event->attr.type;
+
+   switch (event->attr.type) {
+   case PERF_TYPE_BREAKPOINT:
+   info.si_addr = (void *)(unsigned long)event->attr.bp_addr;
+   info.si_perf = (event->attr.bp_len << 16) | 
(u64)event->attr.bp_type;
+   break;
+   default:
+   /*
+* No additional info set (si_perf == 0).
+*
+* Adding new cases for event types to set si_perf to a
+* non-constant value must ensure that si_perf != 0.
+*/
+   break;
+   }
+
force_sig_info(&info);
 }
 
-- 
2.31.0.291.g576ba9dcdaf-goog



[PATCH v3 06/11] perf: Add support for SIGTRAP on perf events

2021-03-24 Thread Marco Elver
Adds bit perf_event_attr::sigtrap, which can be set to cause events to
send SIGTRAP (with si_code TRAP_PERF) to the task where the event
occurred. To distinguish perf events and allow user space to decode
si_perf (if set), the event type is set in si_errno.

The primary motivation is to support synchronous signals on perf events
in the task where an event (such as breakpoints) triggered.

Link: 
https://lore.kernel.org/lkml/ybv3rat566k+6...@hirez.programming.kicks-ass.net/
Suggested-by: Peter Zijlstra 
Acked-by: Dmitry Vyukov 
Signed-off-by: Marco Elver 
---
v2:
* Use atomic_set(_count, 1), since it must always be 0 in
  perf_pending_event_disable().
* Implicitly restrict inheriting events if sigtrap, but the child was
  cloned with CLONE_CLEAR_SIGHAND, because it is not generally safe if
  the child cleared all signal handlers to continue sending SIGTRAP.
---
 include/uapi/linux/perf_event.h |  3 ++-
 kernel/events/core.c| 28 +++-
 2 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 8c5b9f5ad63f..3a4dbb1688f0 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -391,7 +391,8 @@ struct perf_event_attr {
build_id   :  1, /* use build id in mmap2 
events */
inherit_thread :  1, /* children only inherit 
if cloned with CLONE_THREAD */
remove_on_exec :  1, /* event is removed from 
task on exec */
-   __reserved_1   : 27;
+   sigtrap:  1, /* send synchronous 
SIGTRAP on event */
+   __reserved_1   : 26;
 
union {
__u32   wakeup_events;/* wakeup every n events */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b6434697c516..1e4c949bf75f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6391,6 +6391,17 @@ void perf_event_wakeup(struct perf_event *event)
}
 }
 
+static void perf_sigtrap(struct perf_event *event)
+{
+   struct kernel_siginfo info;
+
+   clear_siginfo(&info);
+   info.si_signo = SIGTRAP;
+   info.si_code = TRAP_PERF;
+   info.si_errno = event->attr.type;
+   force_sig_info(&info);
+}
+
 static void perf_pending_event_disable(struct perf_event *event)
 {
int cpu = READ_ONCE(event->pending_disable);
@@ -6400,6 +6411,13 @@ static void perf_pending_event_disable(struct perf_event 
*event)
 
if (cpu == smp_processor_id()) {
WRITE_ONCE(event->pending_disable, -1);
+
+   if (event->attr.sigtrap) {
+   atomic_set(&event->event_limit, 1); /* rearm event */
+   perf_sigtrap(event);
+   return;
+   }
+
perf_event_disable_local(event);
return;
}
@@ -11428,6 +11446,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 
event->state= PERF_EVENT_STATE_INACTIVE;
 
+   if (event->attr.sigtrap)
+   atomic_set(&event->event_limit, 1);
+
if (task) {
event->attach_state = PERF_ATTACH_TASK;
/*
@@ -11706,6 +11727,9 @@ static int perf_copy_attr(struct perf_event_attr __user 
*uattr,
if (attr->remove_on_exec && attr->enable_on_exec)
return -EINVAL;
 
+   if (attr->sigtrap && !attr->remove_on_exec)
+   return -EINVAL;
+
 out:
return ret;
 
@@ -12932,7 +12956,9 @@ inherit_task_group(struct perf_event *event, struct 
task_struct *parent,
struct perf_event_context *child_ctx;
 
if (!event->attr.inherit ||
-   (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD))) {
+   (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD)) ||
+   /* Do not inherit if sigtrap and signal handlers were cleared. */
+   (event->attr.sigtrap && (clone_flags & CLONE_CLEAR_SIGHAND))) {
*inherited_all = 0;
return 0;
}
-- 
2.31.0.291.g576ba9dcdaf-goog



[PATCH v3 08/11] selftests/perf_events: Add kselftest for process-wide sigtrap handling

2021-03-24 Thread Marco Elver
Add a kselftest for testing process-wide perf events with synchronous
SIGTRAP on events (using breakpoints). In particular, we want to test
that changes to the event propagate to all children, and the SIGTRAPs
are in fact synchronously sent to the thread where the event occurred.

Note: The "signal_stress" test case is also added later in the series to
perf tool's built-in tests. The test here is more elaborate in that
respect, which on one hand avoids bloating the perf tool unnecessarily,
but we also benefit from structured tests with TAP-compliant output that
the kselftest framework provides.

Signed-off-by: Marco Elver 
---
v3:
* Fix for latest libc signal.h.

v2:
* Patch added to series.
---
 .../testing/selftests/perf_events/.gitignore  |   2 +
 tools/testing/selftests/perf_events/Makefile  |   6 +
 tools/testing/selftests/perf_events/config|   1 +
 tools/testing/selftests/perf_events/settings  |   1 +
 .../selftests/perf_events/sigtrap_threads.c   | 206 ++
 5 files changed, 216 insertions(+)
 create mode 100644 tools/testing/selftests/perf_events/.gitignore
 create mode 100644 tools/testing/selftests/perf_events/Makefile
 create mode 100644 tools/testing/selftests/perf_events/config
 create mode 100644 tools/testing/selftests/perf_events/settings
 create mode 100644 tools/testing/selftests/perf_events/sigtrap_threads.c

diff --git a/tools/testing/selftests/perf_events/.gitignore 
b/tools/testing/selftests/perf_events/.gitignore
new file mode 100644
index ..4dc43e1bd79c
--- /dev/null
+++ b/tools/testing/selftests/perf_events/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+sigtrap_threads
diff --git a/tools/testing/selftests/perf_events/Makefile 
b/tools/testing/selftests/perf_events/Makefile
new file mode 100644
index ..973a2c39ca83
--- /dev/null
+++ b/tools/testing/selftests/perf_events/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0
+CFLAGS += -Wl,-no-as-needed -Wall -I../../../../usr/include
+LDFLAGS += -lpthread
+
+TEST_GEN_PROGS := sigtrap_threads
+include ../lib.mk
diff --git a/tools/testing/selftests/perf_events/config 
b/tools/testing/selftests/perf_events/config
new file mode 100644
index ..ba58ff2203e4
--- /dev/null
+++ b/tools/testing/selftests/perf_events/config
@@ -0,0 +1 @@
+CONFIG_PERF_EVENTS=y
diff --git a/tools/testing/selftests/perf_events/settings 
b/tools/testing/selftests/perf_events/settings
new file mode 100644
index ..6091b45d226b
--- /dev/null
+++ b/tools/testing/selftests/perf_events/settings
@@ -0,0 +1 @@
+timeout=120
diff --git a/tools/testing/selftests/perf_events/sigtrap_threads.c 
b/tools/testing/selftests/perf_events/sigtrap_threads.c
new file mode 100644
index ..398717e2991a
--- /dev/null
+++ b/tools/testing/selftests/perf_events/sigtrap_threads.c
@@ -0,0 +1,206 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test for perf events with SIGTRAP across all threads.
+ *
+ * Copyright (C) 2021, Google LLC.
+ */
+
+#define _GNU_SOURCE
+
+/* We need the latest siginfo from the kernel repo. */
+#include 
+#include 
+#define __have_siginfo_t 1
+#define __have_sigval_t 1
+#define __have_sigevent_t 1
+#define __siginfo_t_defined
+#define __sigval_t_defined
+#define __sigevent_t_defined
+#define _BITS_SIGINFO_CONSTS_H 1
+#define _BITS_SIGEVENT_CONSTS_H 1
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "../kselftest_harness.h"
+
+#define NUM_THREADS 5
+
+/* Data shared between test body, threads, and signal handler. */
+static struct {
+   int tids_want_signal;   /* Which threads still want a signal. */
+   int signal_count;   /* Sanity check number of signals 
received. */
+   volatile int iterate_on;/* Variable to set breakpoint on. */
+   siginfo_t first_siginfo;/* First observed siginfo_t. */
+} ctx;
+
+static struct perf_event_attr make_event_attr(bool enabled, volatile void 
*addr)
+{
+   struct perf_event_attr attr = {
+   .type   = PERF_TYPE_BREAKPOINT,
+   .size   = sizeof(attr),
+   .sample_period  = 1,
+   .disabled   = !enabled,
+   .bp_addr= (long)addr,
+   .bp_type= HW_BREAKPOINT_RW,
+   .bp_len = HW_BREAKPOINT_LEN_1,
+   .inherit= 1, /* Children inherit events ... */
+   .inherit_thread = 1, /* ... but only cloned with CLONE_THREAD. 
*/
+   .remove_on_exec = 1, /* Required by sigtrap. */
+   .sigtrap= 1, /* Request synchronous SIGTRAP on event. */
+   };
+   return attr;
+}
+
+static void sigtrap_handler(int signum, siginfo_t *info, void *ucontext)
+{
+   if (info->si_code != TRAP_PERF) {
+   fprintf(stderr, "%s: unexpected si_code %d\n", __func__, 
info-

[PATCH v3 05/11] signal: Introduce TRAP_PERF si_code and si_perf to siginfo

2021-03-24 Thread Marco Elver
Introduces the TRAP_PERF si_code, and associated siginfo_t field
si_perf. These will be used by the perf event subsystem to send signals
(if requested) to the task where an event occurred.

Acked-by: Geert Uytterhoeven  # m68k
Acked-by: Arnd Bergmann  # asm-generic
Signed-off-by: Marco Elver 
---
 arch/m68k/kernel/signal.c  |  3 +++
 arch/x86/kernel/signal_compat.c|  5 -
 fs/signalfd.c  |  4 
 include/linux/compat.h |  2 ++
 include/linux/signal.h |  1 +
 include/uapi/asm-generic/siginfo.h |  6 +-
 include/uapi/linux/signalfd.h  |  4 +++-
 kernel/signal.c| 11 +++
 8 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c
index 349570f16a78..a4b7ee1df211 100644
--- a/arch/m68k/kernel/signal.c
+++ b/arch/m68k/kernel/signal.c
@@ -622,6 +622,9 @@ static inline void siginfo_build_tests(void)
/* _sigfault._addr_pkey */
BUILD_BUG_ON(offsetof(siginfo_t, si_pkey) != 0x12);
 
+   /* _sigfault._perf */
+   BUILD_BUG_ON(offsetof(siginfo_t, si_perf) != 0x10);
+
/* _sigpoll */
BUILD_BUG_ON(offsetof(siginfo_t, si_band)   != 0x0c);
BUILD_BUG_ON(offsetof(siginfo_t, si_fd) != 0x10);
diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
index a5330ff498f0..0e5d0a7e203b 100644
--- a/arch/x86/kernel/signal_compat.c
+++ b/arch/x86/kernel/signal_compat.c
@@ -29,7 +29,7 @@ static inline void signal_compat_build_tests(void)
BUILD_BUG_ON(NSIGFPE  != 15);
BUILD_BUG_ON(NSIGSEGV != 9);
BUILD_BUG_ON(NSIGBUS  != 5);
-   BUILD_BUG_ON(NSIGTRAP != 5);
+   BUILD_BUG_ON(NSIGTRAP != 6);
BUILD_BUG_ON(NSIGCHLD != 6);
BUILD_BUG_ON(NSIGSYS  != 2);
 
@@ -138,6 +138,9 @@ static inline void signal_compat_build_tests(void)
BUILD_BUG_ON(offsetof(siginfo_t, si_pkey) != 0x20);
BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pkey) != 0x14);
 
+   BUILD_BUG_ON(offsetof(siginfo_t, si_perf) != 0x18);
+   BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf) != 0x10);
+
CHECK_CSI_OFFSET(_sigpoll);
CHECK_CSI_SIZE  (_sigpoll, 2*sizeof(int));
CHECK_SI_SIZE   (_sigpoll, 4*sizeof(int));
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 456046e15873..040a1142915f 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -134,6 +134,10 @@ static int signalfd_copyinfo(struct signalfd_siginfo 
__user *uinfo,
 #endif
new.ssi_addr_lsb = (short) kinfo->si_addr_lsb;
break;
+   case SIL_PERF_EVENT:
+   new.ssi_addr = (long) kinfo->si_addr;
+   new.ssi_perf = kinfo->si_perf;
+   break;
case SIL_CHLD:
new.ssi_pid= kinfo->si_pid;
new.ssi_uid= kinfo->si_uid;
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 6e65be753603..c8821d966812 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -236,6 +236,8 @@ typedef struct compat_siginfo {
char 
_dummy_pkey[__COMPAT_ADDR_BND_PKEY_PAD];
u32 _pkey;
} _addr_pkey;
+   /* used when si_code=TRAP_PERF */
+   compat_u64 _perf;
};
} _sigfault;
 
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 205526c4003a..1e98548d7cf6 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -43,6 +43,7 @@ enum siginfo_layout {
SIL_FAULT_MCEERR,
SIL_FAULT_BNDERR,
SIL_FAULT_PKUERR,
+   SIL_PERF_EVENT,
SIL_CHLD,
SIL_RT,
SIL_SYS,
diff --git a/include/uapi/asm-generic/siginfo.h 
b/include/uapi/asm-generic/siginfo.h
index d2597000407a..d0bb9125c853 100644
--- a/include/uapi/asm-generic/siginfo.h
+++ b/include/uapi/asm-generic/siginfo.h
@@ -91,6 +91,8 @@ union __sifields {
char _dummy_pkey[__ADDR_BND_PKEY_PAD];
__u32 _pkey;
} _addr_pkey;
+   /* used when si_code=TRAP_PERF */
+   __u64 _perf;
};
} _sigfault;
 
@@ -155,6 +157,7 @@ typedef struct siginfo {
 #define si_lower   _sifields._sigfault._addr_bnd._lower
 #define si_upper   _sifields._sigfault._addr_bnd._upper
 #define si_pkey	_sifields._sigfault._addr_pkey._pkey
+#define si_perf	_sifields._sigfault._perf
 #define si_band	_sifields._sigpoll._band
 #define si_fd  _sifields._sigpoll._fd
 #define si_call_addr   _sifields._sigsys._call_addr
@@ -253,7 +256,8 @@ typedef struct siginfo {
 #define TRAP_BRANCH 3  /* process taken branch trap */
 #define TRAP_HWBKPT 4  /* hardware breakpoint/watchpoint */
 #define TRAP_UNK   5  

[PATCH v3 04/11] perf: Add support for event removal on exec

2021-03-24 Thread Marco Elver
Adds bit perf_event_attr::remove_on_exec, to support removing an event
from a task on exec.

This option supports the case where an event is supposed to be
process-wide only, and should not propagate beyond exec, to limit
monitoring to the original process image only.

Suggested-by: Peter Zijlstra 
Signed-off-by: Marco Elver 
---
v3:
* Rework based on Peter's "perf: Rework perf_event_exit_event()" added
  to the beginning of the series. Intermediate attempts between v2 and
  this v3 can be found here:
  https://lkml.kernel.org/r/yfm6aaksrlf2n...@elver.google.com

v2:
* Add patch to series.
---
 include/uapi/linux/perf_event.h |  3 +-
 kernel/events/core.c| 70 +
 2 files changed, 64 insertions(+), 9 deletions(-)

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 813efb65fea8..8c5b9f5ad63f 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -390,7 +390,8 @@ struct perf_event_attr {
text_poke  :  1, /* include text poke 
events */
build_id   :  1, /* use build id in mmap2 
events */
inherit_thread :  1, /* children only inherit 
if cloned with CLONE_THREAD */
-   __reserved_1   : 28;
+   remove_on_exec :  1, /* event is removed from 
task on exec */
+   __reserved_1   : 27;
 
union {
__u32   wakeup_events;/* wakeup every n events */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 224cbcf6125a..b6434697c516 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4247,6 +4247,57 @@ static void perf_event_enable_on_exec(int ctxn)
put_ctx(clone_ctx);
 }
 
+static void perf_remove_from_owner(struct perf_event *event);
+static void perf_event_exit_event(struct perf_event *event,
+ struct perf_event_context *ctx);
+
+/*
+ * Removes all events from the current task that have been marked
+ * remove-on-exec, and feeds their values back to parent events.
+ */
+static void perf_event_remove_on_exec(int ctxn)
+{
+   struct perf_event_context *ctx, *clone_ctx = NULL;
+   struct perf_event *event, *next;
+   LIST_HEAD(free_list);
+   unsigned long flags;
+   bool modified = false;
+
+   ctx = perf_pin_task_context(current, ctxn);
+   if (!ctx)
+   return;
+
+   mutex_lock(&ctx->mutex);
+
+   if (WARN_ON_ONCE(ctx->task != current))
+   goto unlock;
+
+   list_for_each_entry_safe(event, next, &ctx->event_list, event_entry) {
+   if (!event->attr.remove_on_exec)
+   continue;
+
+   if (!is_kernel_event(event))
+   perf_remove_from_owner(event);
+
+   modified = true;
+
+   perf_event_exit_event(event, ctx);
+   }
+
+   raw_spin_lock_irqsave(&ctx->lock, flags);
+   if (modified)
+   clone_ctx = unclone_ctx(ctx);
+   --ctx->pin_count;
+   raw_spin_unlock_irqrestore(&ctx->lock, flags);
+
+unlock:
+   mutex_unlock(&ctx->mutex);
+
+   put_ctx(ctx);
+   if (clone_ctx)
+   put_ctx(clone_ctx);
+}
+
 struct perf_read_data {
struct perf_event *event;
bool group;
@@ -7559,18 +7610,18 @@ void perf_event_exec(void)
struct perf_event_context *ctx;
int ctxn;
 
-   rcu_read_lock();
for_each_task_context_nr(ctxn) {
-   ctx = current->perf_event_ctxp[ctxn];
-   if (!ctx)
-   continue;
-
perf_event_enable_on_exec(ctxn);
+   perf_event_remove_on_exec(ctxn);
 
-   perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL,
-  true);
+   rcu_read_lock();
+   ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+   if (ctx) {
+   perf_iterate_ctx(ctx, perf_event_addr_filters_exec,
+NULL, true);
+   }
+   rcu_read_unlock();
}
-   rcu_read_unlock();
 }
 
 struct remote_output {
@@ -11652,6 +11703,9 @@ static int perf_copy_attr(struct perf_event_attr __user 
*uattr,
if (!attr->inherit && attr->inherit_thread)
return -EINVAL;
 
+   if (attr->remove_on_exec && attr->enable_on_exec)
+   return -EINVAL;
+
 out:
return ret;
 
-- 
2.31.0.291.g576ba9dcdaf-goog



[PATCH v3 03/11] perf: Support only inheriting events if cloned with CLONE_THREAD

2021-03-24 Thread Marco Elver
Adds bit perf_event_attr::inherit_thread, to restricting inheriting
events only if the child was cloned with CLONE_THREAD.

This option supports the case where an event is supposed to be
process-wide only (including subthreads), but should not propagate
beyond the current process's shared environment.

Link: 
https://lore.kernel.org/lkml/ybvj6ejr%2fdy2t...@hirez.programming.kicks-ass.net/
Suggested-by: Peter Zijlstra 
Signed-off-by: Marco Elver 
---
v2:
* Add patch to series.
---
 include/linux/perf_event.h  |  5 +++--
 include/uapi/linux/perf_event.h |  3 ++-
 kernel/events/core.c| 21 ++---
 kernel/fork.c   |  2 +-
 4 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 3d478abf411c..1660039199b2 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -958,7 +958,7 @@ extern void __perf_event_task_sched_in(struct task_struct 
*prev,
   struct task_struct *task);
 extern void __perf_event_task_sched_out(struct task_struct *prev,
struct task_struct *next);
-extern int perf_event_init_task(struct task_struct *child);
+extern int perf_event_init_task(struct task_struct *child, u64 clone_flags);
 extern void perf_event_exit_task(struct task_struct *child);
 extern void perf_event_free_task(struct task_struct *task);
 extern void perf_event_delayed_put(struct task_struct *task);
@@ -1449,7 +1449,8 @@ perf_event_task_sched_in(struct task_struct *prev,
 static inline void
 perf_event_task_sched_out(struct task_struct *prev,
  struct task_struct *next) { }
-static inline int perf_event_init_task(struct task_struct *child)  { 
return 0; }
+static inline int perf_event_init_task(struct task_struct *child,
+  u64 clone_flags) { 
return 0; }
 static inline void perf_event_exit_task(struct task_struct *child) { }
 static inline void perf_event_free_task(struct task_struct *task)  { }
 static inline void perf_event_delayed_put(struct task_struct *task){ }
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index ad15e40d7f5d..813efb65fea8 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -389,7 +389,8 @@ struct perf_event_attr {
cgroup :  1, /* include cgroup events */
text_poke  :  1, /* include text poke 
events */
build_id   :  1, /* use build id in mmap2 
events */
-   __reserved_1   : 29;
+   inherit_thread :  1, /* children only inherit 
if cloned with CLONE_THREAD */
+   __reserved_1   : 28;
 
union {
__u32   wakeup_events;/* wakeup every n events */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 37d106837962..224cbcf6125a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11649,6 +11649,9 @@ static int perf_copy_attr(struct perf_event_attr __user 
*uattr,
(attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT))
return -EINVAL;
 
+   if (!attr->inherit && attr->inherit_thread)
+   return -EINVAL;
+
 out:
return ret;
 
@@ -12869,12 +12872,13 @@ static int
 inherit_task_group(struct perf_event *event, struct task_struct *parent,
   struct perf_event_context *parent_ctx,
   struct task_struct *child, int ctxn,
-  int *inherited_all)
+  u64 clone_flags, int *inherited_all)
 {
int ret;
struct perf_event_context *child_ctx;
 
-   if (!event->attr.inherit) {
+   if (!event->attr.inherit ||
+   (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD))) {
*inherited_all = 0;
return 0;
}
@@ -12906,7 +12910,8 @@ inherit_task_group(struct perf_event *event, struct 
task_struct *parent,
 /*
  * Initialize the perf_event context in task_struct
  */
-static int perf_event_init_context(struct task_struct *child, int ctxn)
+static int perf_event_init_context(struct task_struct *child, int ctxn,
+  u64 clone_flags)
 {
struct perf_event_context *child_ctx, *parent_ctx;
struct perf_event_context *cloned_ctx;
@@ -12946,7 +12951,8 @@ static int perf_event_init_context(struct task_struct 
*child, int ctxn)
 */
	perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
ret = inherit_task_group(event, parent, parent_ctx,
-child, ctxn, &inherited_all);
+child, ctxn, clone_flags,
+ 

[PATCH v3 02/11] perf: Apply PERF_EVENT_IOC_MODIFY_ATTRIBUTES to children

2021-03-24 Thread Marco Elver
As with other ioctls (such as PERF_EVENT_IOC_{ENABLE,DISABLE}), fix up
handling of PERF_EVENT_IOC_MODIFY_ATTRIBUTES to also apply to children.

Link: https://lkml.kernel.org/r/ybqvay8atmyto...@hirez.programming.kicks-ass.net
Suggested-by: Dmitry Vyukov 
Reviewed-by: Dmitry Vyukov 
Signed-off-by: Marco Elver 
---
 kernel/events/core.c | 22 +-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 57de8d436efd..37d106837962 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3199,16 +3199,36 @@ static int perf_event_modify_breakpoint(struct 
perf_event *bp,
 static int perf_event_modify_attr(struct perf_event *event,
  struct perf_event_attr *attr)
 {
+   int (*func)(struct perf_event *, struct perf_event_attr *);
+   struct perf_event *child;
+   int err;
+
if (event->attr.type != attr->type)
return -EINVAL;
 
switch (event->attr.type) {
case PERF_TYPE_BREAKPOINT:
-   return perf_event_modify_breakpoint(event, attr);
+   func = perf_event_modify_breakpoint;
+   break;
default:
/* Place holder for future additions. */
return -EOPNOTSUPP;
}
+
+   WARN_ON_ONCE(event->ctx->parent_ctx);
+
+   mutex_lock(&event->child_mutex);
+   err = func(event, attr);
+   if (err)
+   goto out;
+   list_for_each_entry(child, &event->child_list, child_list) {
+   err = func(child, attr);
+   if (err)
+   goto out;
+   }
+out:
+   mutex_unlock(&event->child_mutex);
+   return err;
 }
 
 static void ctx_sched_out(struct perf_event_context *ctx,
-- 
2.31.0.291.g576ba9dcdaf-goog



[PATCH v3 01/11] perf: Rework perf_event_exit_event()

2021-03-24 Thread Marco Elver
From: Peter Zijlstra 

Make perf_event_exit_event() more robust, such that we can use it from
other contexts. Specifically the up and coming remove_on_exec.

For this to work we need to address a few issues. Remove_on_exec will
not destroy the entire context, so we cannot rely on TASK_TOMBSTONE to
disable event_function_call() and we thus have to use
perf_remove_from_context().

When using perf_remove_from_context(), there's two races to consider.
The first is against close(), where we can have concurrent tear-down
of the event. The second is against child_list iteration, which should
not find a half baked event.

To address this, teach perf_remove_from_context() to special case
!ctx->is_active and about DETACH_CHILD.

Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Marco Elver 
---
v3:
* New dependency for series:
  https://lkml.kernel.org/r/YFn/i3akf+toj...@hirez.programming.kicks-ass.net
---
 include/linux/perf_event.h |   1 +
 kernel/events/core.c   | 142 +
 2 files changed, 80 insertions(+), 63 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 3f7f89ea5e51..3d478abf411c 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -607,6 +607,7 @@ struct swevent_hlist {
 #define PERF_ATTACH_TASK_DATA  0x08
 #define PERF_ATTACH_ITRACE 0x10
 #define PERF_ATTACH_SCHED_CB   0x20
+#define PERF_ATTACH_CHILD  0x40
 
 struct perf_cgroup;
 struct perf_buffer;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 03db40f6cba9..57de8d436efd 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2204,6 +2204,26 @@ static void perf_group_detach(struct perf_event *event)
perf_event__header_size(leader);
 }
 
+static void sync_child_event(struct perf_event *child_event);
+
+static void perf_child_detach(struct perf_event *event)
+{
+   struct perf_event *parent_event = event->parent;
+
+   if (!(event->attach_state & PERF_ATTACH_CHILD))
+   return;
+
+   event->attach_state &= ~PERF_ATTACH_CHILD;
+
+   if (WARN_ON_ONCE(!parent_event))
+   return;
+
+   lockdep_assert_held(&parent_event->child_mutex);
+
+   sync_child_event(event);
+   list_del_init(&event->child_list);
+}
+
 static bool is_orphaned_event(struct perf_event *event)
 {
return event->state == PERF_EVENT_STATE_DEAD;
@@ -2311,6 +2331,7 @@ group_sched_out(struct perf_event *group_event,
 }
 
 #define DETACH_GROUP   0x01UL
+#define DETACH_CHILD   0x02UL
 
 /*
  * Cross CPU call to remove a performance event
@@ -2334,6 +2355,8 @@ __perf_remove_from_context(struct perf_event *event,
event_sched_out(event, cpuctx, ctx);
if (flags & DETACH_GROUP)
perf_group_detach(event);
+   if (flags & DETACH_CHILD)
+   perf_child_detach(event);
list_del_event(event, ctx);
 
if (!ctx->nr_events && ctx->is_active) {
@@ -2362,25 +2385,21 @@ static void perf_remove_from_context(struct perf_event 
*event, unsigned long fla
 
lockdep_assert_held(&ctx->mutex);
 
-   event_function_call(event, __perf_remove_from_context, (void *)flags);
-
/*
-* The above event_function_call() can NO-OP when it hits
-* TASK_TOMBSTONE. In that case we must already have been detached
-* from the context (by perf_event_exit_event()) but the grouping
-* might still be in-tact.
+* Because of perf_event_exit_task(), perf_remove_from_context() ought
+* to work in the face of TASK_TOMBSTONE, unlike every other
+* event_function_call() user.
 */
-   WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
-   if ((flags & DETACH_GROUP) &&
-   (event->attach_state & PERF_ATTACH_GROUP)) {
-   /*
-* Since in that case we cannot possibly be scheduled, simply
-* detach now.
-*/
-   raw_spin_lock_irq(&ctx->lock);
-   perf_group_detach(event);
+   raw_spin_lock_irq(&ctx->lock);
+   if (!ctx->is_active) {
+   __perf_remove_from_context(event, __get_cpu_context(ctx),
+  ctx, (void *)flags);
raw_spin_unlock_irq(&ctx->lock);
+   return;
}
+   raw_spin_unlock_irq(&ctx->lock);
+
+   event_function_call(event, __perf_remove_from_context, (void *)flags);
 }
 
 /*
@@ -12373,14 +12392,17 @@ void perf_pmu_migrate_context(struct pmu *pmu, int 
src_cpu, int dst_cpu)
 }
 EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
 
-static void sync_child_event(struct perf_event *child_event,
-  struct task_struct *child)
+static void sync_child_event(struct perf_event *child_event)
 {
struct perf_event *parent_event = child_event->parent;
u64 child_val;
 
-   if (child_event->attr.inherit_stat)

[PATCH v3 00/11] Add support for synchronous signals on perf events

2021-03-24 Thread Marco Elver
The perf subsystem today unifies various tracing and monitoring
features, from both software and hardware. One benefit of the perf
subsystem is automatically inheriting events to child tasks, which
enables process-wide events monitoring with low overheads. By default
perf events are non-intrusive, not affecting behaviour of the tasks
being monitored.

For certain use-cases, however, it makes sense to leverage the
generality of the perf events subsystem and optionally allow the tasks
being monitored to receive signals on events they are interested in.
This patch series adds the option to synchronously signal user space on
events.

To better support process-wide synchronous self-monitoring, without
events propagating to children that do not share the current process's
shared environment, two pre-requisite patches are added to optionally
restrict inheritance to CLONE_THREAD, and remove events on exec (without
affecting the parent).

Examples how to use these features can be found in the tests added at
the end of the series. In addition to the tests added, the series has
also been subjected to syzkaller fuzzing (focus on 'kernel/events/'
coverage).

Motivation and Example Uses
---

1.  Our immediate motivation is low-overhead sampling-based race
detection for user space [1]. By using perf_event_open() at
process initialization, we can create hardware
breakpoint/watchpoint events that are propagated automatically
to all threads in a process. As far as we are aware, today no
existing kernel facility (such as ptrace) allows us to set up
process-wide watchpoints with minimal overheads (that are
comparable to mprotect() of whole pages).

2.  Other low-overhead error detectors that rely on detecting
accesses to certain memory locations or code, process-wide and
also only in a specific set of subtasks or threads.

[1] https://llvm.org/devmtg/2020-09/slides/Morehouse-GWP-Tsan.pdf

Other ideas for use-cases we found interesting, but should only
illustrate the range of potential to further motivate the utility (we're
sure there are more):

3.  Code hot patching without full stop-the-world. Specifically, by
setting a code breakpoint to entry to the patched routine, then
send signals to threads and check that they are not in the
routine, but without stopping them further. If any of the
threads will enter the routine, it will receive SIGTRAP and
pause.

4.  Safepoints without mprotect(). Some Java implementations use
"load from a known memory location" as a safepoint. When threads
need to be stopped, the page containing the location is
mprotect()ed and threads get a signal. This could be replaced with
a watchpoint, which does not require a whole page nor DTLB
shootdowns.

5.  Threads receiving signals on performance events to
throttle/unthrottle themselves.

6.  Tracking data flow globally.

Changelog
-

v3:
* Add patch "perf: Rework perf_event_exit_event()" to beginning of
  series, courtesy of Peter Zijlstra.
* Rework "perf: Add support for event removal on exec" based on
  the added "perf: Rework perf_event_exit_event()".
* Fix kselftests to work with more recent libc, due to the way it forces
  using the kernel's own siginfo_t.
* Add basic perf-tool built-in test.

v2/RFC: https://lkml.kernel.org/r/20210310104139.679618-1-el...@google.com
* Patch "Support only inheriting events if cloned with CLONE_THREAD"
  added to series.
* Patch "Add support for event removal on exec" added to series.
* Patch "Add kselftest for process-wide sigtrap handling" added to
  series.
* Patch "Add kselftest for remove_on_exec" added to series.
* Implicitly restrict inheriting events if sigtrap, but the child was
  cloned with CLONE_CLEAR_SIGHAND, because it is not generally safe if
  the child cleared all signal handlers to continue sending SIGTRAP.
* Various minor fixes (see details in patches).

v1/RFC: https://lkml.kernel.org/r/20210223143426.2412737-1-el...@google.com

Pre-series: The discussion at [2] led to the changes in this series. The
approach taken in "Add support for SIGTRAP on perf events" to trigger
the signal was suggested by Peter Zijlstra in [3].

[2] 
https://lore.kernel.org/lkml/CACT4Y+YPrXGw+AtESxAgPyZ84TYkNZdP0xpocX2jwVAbZD=-x...@mail.gmail.com/

[3] 
https://lore.kernel.org/lkml/ybv3rat566k+6...@hirez.programming.kicks-ass.net/


Marco Elver (10):
  perf: Apply PERF_EVENT_IOC_MODIFY_ATTRIBUTES to children
  perf: Support only inheriting events if cloned with CLONE_THREAD
  perf: Add support for event removal on exec
  signal: Introduce TRAP_PERF si_code and si_perf to siginfo
  perf: Add support for SIGTRAP on perf events
  perf: Add breakpoint information to siginfo on SIGTRAP
  selftests/perf_even

Re: [PATCH] kernel: kcov: fix a typo in comment

2021-03-23 Thread Marco Elver
On Tue, 23 Mar 2021 at 07:45, 'Dmitry Vyukov' via kasan-dev
 wrote:
> On Tue, Mar 23, 2021 at 7:24 AM tl455047  wrote:
> >
> > Fixed a typo in comment.
> >
> > Signed-off-by: tl455047 
>
> Reviewed-by: Dmitry Vyukov 
>
> +Andrew, linux-mm as KCOV patches are generally merged into mm.
>
> Thanks for the fix

FYI, I believe this code may not be accepted due to this:

"[...] It is imperative that all code contributed to the kernel be legitimately
free software.  For that reason, code from anonymous (or pseudonymous)
contributors will not be accepted."

See Documentation/process/1.Intro.rst


Re: [PATCH RFC v2 8/8] selftests/perf: Add kselftest for remove_on_exec

2021-03-23 Thread Marco Elver
On Tue, Mar 23, 2021 at 10:47AM +0100, Marco Elver wrote:
> On Tue, 23 Mar 2021 at 04:10, Ian Rogers  wrote:
> > On Mon, Mar 22, 2021 at 6:24 AM Marco Elver  wrote:
> > > On Wed, Mar 10, 2021 at 11:41AM +0100, Marco Elver wrote:
> > > > Add kselftest to test that remove_on_exec removes inherited events from
> > > > child tasks.
> > > >
> > > > Signed-off-by: Marco Elver 
> > >
> > > To make compatible with more recent libc, we'll need to fixup the tests
> > > with the below.
> > >
> > > Also, I've seen that tools/perf/tests exists, however it seems to be
> > > primarily about perf-tool related tests. Is this correct?
> > >
> > > I'd propose to keep these purely kernel ABI related tests separate, and
> > > that way we can also make use of the kselftests framework which will
> > > also integrate into various CI systems such as kernelci.org.
> >
> > Perhaps there is a way to have both? Having the perf tool spot an
> > errant kernel feels like a feature. There are also
> > tools/lib/perf/tests and Vince Weaver's tests [1]. It is possible to
> > run standalone tests from within perf test by having them be executed
> > by a shell test.
> 
> Thanks for the pointers. Sure, I'd support more additional tests.
> 
> But I had another look and it seems the tests in
> tools/{perf,lib/perf}/tests do focus on perf-tool or the library
> respectively, so adding kernel ABI tests there feels wrong. (If
> perf-tool somehow finds use for sigtrap, or remove_on_exec, then
> having a perf-tool specific test for those would make sense again.)

Ok, I checked once more, and I did find a few pure kernel ABI tests e.g.
in "wp.c".

[...]
> Because I'd much prefer in-tree tests with little boilerplate, that
> are structured with parsable output; in the kernel we have the
> kselftest framework for tests with a user space component, and KUnit
> for pure in-kernel tests.

So let's try to have both... but from what I could tell, the
remove_on_exec test just can't be turned into a perf tool built-in test,
at least not easily. In perf tool I also can't use the new "si_perf"
field yet.

I'll add the patch below at the end of the series, so that we can have
both. Too many tests probably don't hurt...

Thanks,
-- Marco

-- >8 --


commit 6a98611ace59c867aa135f780b1879990180548e
Author: Marco Elver 
Date:   Tue Mar 23 19:51:12 2021 +0100

perf test: Add basic stress test for sigtrap handling

Ports the stress test from tools/testing/selftests/sigtrap_threads.c,
and add as a perf tool built-in test. This allows checking the basic
sigtrap functionality from within the perf tool.

Signed-off-by: Marco Elver 

diff --git a/tools/perf/tests/Build b/tools/perf/tests/Build
index 650aec19d490..a429c7a02b37 100644
--- a/tools/perf/tests/Build
+++ b/tools/perf/tests/Build
@@ -64,6 +64,7 @@ perf-y += parse-metric.o
 perf-y += pe-file-parsing.o
 perf-y += expand-cgroup.o
 perf-y += perf-time-to-tsc.o
+perf-y += sigtrap.o
 
 $(OUTPUT)tests/llvm-src-base.c: tests/bpf-script-example.c tests/Build
$(call rule_mkdir)
diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c
index c4b888f18e9c..28a1cb5eaa77 100644
--- a/tools/perf/tests/builtin-test.c
+++ b/tools/perf/tests/builtin-test.c
@@ -359,6 +359,11 @@ static struct test generic_tests[] = {
.func = test__perf_time_to_tsc,
.is_supported = test__tsc_is_supported,
},
+   {
+   .desc = "Sigtrap support",
+   .func = test__sigtrap,
+   .is_supported = test__wp_is_supported, /* uses wp for test */
+   },
{
.func = NULL,
},
diff --git a/tools/perf/tests/sigtrap.c b/tools/perf/tests/sigtrap.c
new file mode 100644
index ..0888a4e0
--- /dev/null
+++ b/tools/perf/tests/sigtrap.c
@@ -0,0 +1,153 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Basic stress-test for sigtrap support.
+ *
+ * Copyright (C) 2021, Google LLC.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "tests.h"
+#include "debug.h"
+#include "event.h"
+#include "cloexec.h"
+#include "../perf-sys.h"
+
+#define NUM_THREADS 5
+
+/* Data shared between test body, threads, and signal handler. */
+static struct {
+   int tids_want_signal;   /* Which threads still want a signal. */
+   int signal_count;   /* Sanity check number of signals 
received. */
+   volatile int iterate_on;/* Variable to set breakpoint on. */
+   siginfo_t first_siginfo;/* First observed siginfo_t. */
+} ctx;
+
+static struct perf_event_attr m

Re: [PATCH RFC v2 8/8] selftests/perf: Add kselftest for remove_on_exec

2021-03-23 Thread Marco Elver
On Tue, Mar 23, 2021 at 03:45PM +0100, Peter Zijlstra wrote:
> On Tue, Mar 23, 2021 at 11:32:03AM +0100, Peter Zijlstra wrote:
> > And at that point there's very little value in still using
> > perf_event_exit_event()... let me see if there's something to be done
> > about that.
> 
> I ended up with something like the below. Which then simplifies
> remove_on_exec() to:
> 
[...]
> 
> Very lightly tested with that {1..1000} thing.
> 
> ---
> 
> Subject: perf: Rework perf_event_exit_event()
> From: Peter Zijlstra 
> Date: Tue Mar 23 15:16:06 CET 2021
> 
> Make perf_event_exit_event() more robust, such that we can use it from
> other contexts. Specifically the up and coming remove_on_exec.
> 
> For this to work we need to address a few issues. Remove_on_exec will
> not destroy the entire context, so we cannot rely on TASK_TOMBSTONE to
> disable event_function_call() and we thus have to use
> perf_remove_from_context().
> 
> When using perf_remove_from_context(), there's two races to consider.
> The first is against close(), where we can have concurrent tear-down
> of the event. The second is against child_list iteration, which should
> not find a half baked event.
> 
> To address this, teach perf_remove_from_context() to special case
> !ctx->is_active and about DETACH_CHILD.
> 
> Signed-off-by: Peter Zijlstra (Intel) 

Very nice, thanks! It seems to all hold up to testing as well.

Unless you already have this on some branch somewhere, I'll prepend it
to the series for now. I'll test some more and try to get v3 out
tomorrow.

Thanks,
-- Marco


Re: [PATCH] kasan: fix hwasan build for gcc

2021-03-23 Thread Marco Elver
On Tue, 23 Mar 2021 at 13:41, Arnd Bergmann  wrote:
>
> From: Arnd Bergmann 
>
> gcc-11 adds support for -fsanitize=kernel-hwaddress, so it becomes
> possible to enable CONFIG_KASAN_SW_TAGS.
>
> Unfortunately this fails to build at the moment, because the
> corresponding command line arguments use llvm specific syntax.
>
> Change it to use the cc-param macro instead, which works on both
> clang and gcc.
>
> Signed-off-by: Arnd Bergmann 

Reviewed-by: Marco Elver 

Although I think you need to rebase against either -mm or -next,
because there have been changes to the CONFIG_KASAN_STACK variable.

> ---
>  scripts/Makefile.kasan | 8 
>  1 file changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/scripts/Makefile.kasan b/scripts/Makefile.kasan
> index 1e000cc2e7b4..0a2789783d1b 100644
> --- a/scripts/Makefile.kasan
> +++ b/scripts/Makefile.kasan
> @@ -36,14 +36,14 @@ endif # CONFIG_KASAN_GENERIC
>  ifdef CONFIG_KASAN_SW_TAGS
>
>  ifdef CONFIG_KASAN_INLINE
> -instrumentation_flags := -mllvm 
> -hwasan-mapping-offset=$(KASAN_SHADOW_OFFSET)
> +instrumentation_flags := $(call 
> cc-param,hwasan-mapping-offset=$(KASAN_SHADOW_OFFSET))
>  else
> -instrumentation_flags := -mllvm -hwasan-instrument-with-calls=1
> +instrumentation_flags := $(call cc-param,hwasan-instrument-with-calls=1)
>  endif
>
>  CFLAGS_KASAN := -fsanitize=kernel-hwaddress \
> -   -mllvm -hwasan-instrument-stack=$(CONFIG_KASAN_STACK) \
> -   -mllvm -hwasan-use-short-granules=0 \
> +   $(call 
> cc-param,hwasan-instrument-stack=$(CONFIG_KASAN_STACK)) \
> +   $(call cc-param,hwasan-use-short-granules=0) \
> $(instrumentation_flags)
>
>  endif # CONFIG_KASAN_SW_TAGS
> --
> 2.29.2
>


Re: [PATCH RFC v2 8/8] selftests/perf: Add kselftest for remove_on_exec

2021-03-23 Thread Marco Elver
On Tue, Mar 23, 2021 at 11:41AM +0100, Marco Elver wrote:
> On Tue, 23 Mar 2021 at 11:32, Peter Zijlstra  wrote:
[...]
> > > + if (parent_event) {
> > >   /*
> > > +  * Remove event from parent, to avoid race where the
> > > +  * parent concurrently iterates through its 
> > > children to
> > > +  * enable, disable, or otherwise modify an event.
> > >*/
> > > + mutex_lock(&parent_event->child_mutex);
> > > + list_del_init(&event->child_list);
> > > + mutex_unlock(&parent_event->child_mutex);
> > >   }
> >
> > ^^^ this, right?
> >
> > But that's something perf_event_exit_event() alread does. So then you're
> > worried about the order of things.
> 
> Correct. We somehow need to prohibit the parent from doing an
> event_function_call() while we potentially deactivate the context with
> perf_remove_from_context().
> 
> > > +
> > > + perf_remove_from_context(event, !!event->parent * 
> > > DETACH_GROUP);
> > > + perf_event_exit_event(event, ctx, current, true);
> > >   }
> >
> > perf_event_release_kernel() first does perf_remove_from_context() and
> > then clears the child_list, and that makes sense because if we're there,
> > there's no external access anymore, the filedesc is gone and nobody will
> > be iterating child_list anymore.
> >
> > perf_event_exit_task_context() and perf_event_exit_event() OTOH seem to
> > rely on ctx->task == TOMBSTONE to sabotage event_function_call() such
> > that if anybody is iterating the child_list, it'll NOP out.
> >
> > But here we don't have neither, and thus need to worry about the order
> > vs child_list iteration.
> >
> > I suppose we should stick sync_child_event() in there as well.
> >
> > And at that point there's very little value in still using
> > perf_event_exit_event()... let me see if there's something to be done
> > about that.
> 
> I don't mind dropping use of perf_event_exit_event() and open coding
> all of this. That would also avoid modifying perf_event_exit_event().
> 
> But I leave it to you what you think is nicest.

I played a bit more with it, and the below would be the version without
using perf_event_exit_event(). Perhaps it isn't too bad.

Thanks,
-- Marco

-- >8 --

diff --git a/kernel/events/core.c b/kernel/events/core.c
index aa47e111435e..288b61820dab 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2165,8 +2165,9 @@ static void perf_group_detach(struct perf_event *event)
 * If this is a sibling, remove it from its group.
 */
if (leader != event) {
+   leader->nr_siblings--;
list_del_init(&event->sibling_list);
-   event->group_leader->nr_siblings--;
+   event->group_leader = event;
goto out;
}
 
@@ -2180,8 +2181,9 @@ static void perf_group_detach(struct perf_event *event)
if (sibling->event_caps & PERF_EV_CAP_SIBLING)
perf_remove_sibling_event(sibling);
 
-   sibling->group_leader = sibling;
+   leader->nr_siblings--;
list_del_init(&sibling->sibling_list);
+   sibling->group_leader = sibling;
 
/* Inherit group flags from the previous leader */
sibling->group_caps = event->group_caps;
@@ -2358,10 +2360,19 @@ __perf_remove_from_context(struct perf_event *event,
 static void perf_remove_from_context(struct perf_event *event, unsigned long 
flags)
 {
struct perf_event_context *ctx = event->ctx;
+   bool remove;
 
lockdep_assert_held(&ctx->mutex);
 
-   event_function_call(event, __perf_remove_from_context, (void *)flags);
+   /*
+* There is concurrency vs remove_on_exec().
+*/
+   raw_spin_lock_irq(&ctx->lock);
+   remove = (event->attach_state & PERF_ATTACH_CONTEXT);
+   raw_spin_unlock_irq(&ctx->lock);
+
+   if (remove)
+   event_function_call(event, __perf_remove_from_context, (void 
*)flags);
 
/*
 * The above event_function_call() can NO-OP when it hits
@@ -4196,43 +4207,86 @@ static void perf_event_enable_on_exec(int ctxn)
 }
 
 static void perf_remove_from_owner(struct perf_event *event);
-static void perf_event_exit_event(struct perf_event *child_event,
- struct perf_event_context *child_ctx,
- struct task_struct *child);
+static void sync_child_event(struct perf_event *ch

Re: [PATCH RFC v2 8/8] selftests/perf: Add kselftest for remove_on_exec

2021-03-23 Thread Marco Elver
On Tue, 23 Mar 2021 at 11:32, Peter Zijlstra  wrote:
>
> On Tue, Mar 23, 2021 at 10:52:41AM +0100, Marco Elver wrote:
>
> > with efs->func==__perf_event_enable. I believe it's sufficient to add
> >
> >   mutex_lock(&parent_event->child_mutex);
> >   list_del_init(&event->child_list);
> >   mutex_unlock(&parent_event->child_mutex);
> >
> > right before removing from context. With the version I have now (below
> > for completeness), extended torture with the above test results in no
> > more warnings and the test also passes.
> >
>
> > + list_for_each_entry_safe(event, next, &ctx->event_list, event_entry) {
> > + struct perf_event *parent_event = event->parent;
> > +
> > + if (!event->attr.remove_on_exec)
> >   continue;
> >
> > + if (!is_kernel_event(event))
> > + perf_remove_from_owner(event);
> >
> > + modified = true;
> > +
> > + if (parent_event) {
> >   /*
> > +  * Remove event from parent, to avoid race where the
> > +  * parent concurrently iterates through its children 
> > to
> > +  * enable, disable, or otherwise modify an event.
> >*/
> > + mutex_lock(&parent_event->child_mutex);
> > + list_del_init(&event->child_list);
> > + mutex_unlock(&parent_event->child_mutex);
> >   }
>
> ^^^ this, right?
>
> But that's something perf_event_exit_event() alread does. So then you're
> worried about the order of things.

Correct. We somehow need to prohibit the parent from doing an
event_function_call() while we potentially deactivate the context with
perf_remove_from_context().

> > +
> > + perf_remove_from_context(event, !!event->parent * 
> > DETACH_GROUP);
> > + perf_event_exit_event(event, ctx, current, true);
> >   }
>
> perf_event_release_kernel() first does perf_remove_from_context() and
> then clears the child_list, and that makes sense because if we're there,
> there's no external access anymore, the filedesc is gone and nobody will
> be iterating child_list anymore.
>
> perf_event_exit_task_context() and perf_event_exit_event() OTOH seem to
> rely on ctx->task == TOMBSTONE to sabotage event_function_call() such
> that if anybody is iterating the child_list, it'll NOP out.
>
> But here we don't have neither, and thus need to worry about the order
> vs child_list iteration.
>
> I suppose we should stick sync_child_event() in there as well.
>
> And at that point there's very little value in still using
> perf_event_exit_event()... let me see if there's something to be done
> about that.

I don't mind dropping use of perf_event_exit_event() and open coding
all of this. That would also avoid modifying perf_event_exit_event().

But I leave it to you what you think is nicest.

Thanks,
-- Marco


Re: [PATCH RFC v2 8/8] selftests/perf: Add kselftest for remove_on_exec

2021-03-23 Thread Marco Elver
On Mon, Mar 22, 2021 at 05:42PM +0100, Peter Zijlstra wrote:
> On Mon, Mar 22, 2021 at 02:24:40PM +0100, Marco Elver wrote:
> > To make compatible with more recent libc, we'll need to fixup the tests
> > with the below.
> 
> OK, that reproduced things here, thanks!
> 
> The below seems to not explode instantly it still has the
> alternative version in as well (and I think it might even work too, but
> the one I left in seems simpler).

Thanks! Unfortunately neither version worked if I tortured it a little
with this:

for x in {1..1000}; do ( 
tools/testing/selftests/perf_events/remove_on_exec & ); done

Which resulted in the 2 warnings:

WARNING: CPU: 1 PID: 795 at kernel/events/core.c:242 
event_function+0xf3/0x100
WARNING: CPU: 1 PID: 795 at kernel/events/core.c:247 
event_function+0xef/0x100

with efs->func==__perf_event_enable. I believe it's sufficient to add

mutex_lock(&parent_event->child_mutex);
list_del_init(&event->child_list);
mutex_unlock(&parent_event->child_mutex);

right before removing from context. With the version I have now (below
for completeness), extended torture with the above test results in no
more warnings and the test also passes.


I'd be happy to send a non-RFC v3 with all that squashed in. I'd need
your Signed-off-by for the diff you sent to proceed (and add your
Co-developed-by).

Thanks,
-- Marco

-- >8 --

diff --git a/kernel/events/core.c b/kernel/events/core.c
index aa47e111435e..cea7c88fe131 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2165,8 +2165,9 @@ static void perf_group_detach(struct perf_event *event)
 * If this is a sibling, remove it from its group.
 */
if (leader != event) {
+   leader->nr_siblings--;
list_del_init(&event->sibling_list);
-   event->group_leader->nr_siblings--;
+   event->group_leader = event;
goto out;
}
 
@@ -2180,8 +2181,9 @@ static void perf_group_detach(struct perf_event *event)
if (sibling->event_caps & PERF_EV_CAP_SIBLING)
perf_remove_sibling_event(sibling);
 
-   sibling->group_leader = sibling;
+   leader->nr_siblings--;
list_del_init(&sibling->sibling_list);
+   sibling->group_leader = sibling;
 
/* Inherit group flags from the previous leader */
sibling->group_caps = event->group_caps;
@@ -2358,10 +2360,19 @@ __perf_remove_from_context(struct perf_event *event,
 static void perf_remove_from_context(struct perf_event *event, unsigned long 
flags)
 {
struct perf_event_context *ctx = event->ctx;
+   bool remove;
 
lockdep_assert_held(&ctx->mutex);
 
-   event_function_call(event, __perf_remove_from_context, (void *)flags);
+   /*
+* There is concurrency vs remove_on_exec().
+*/
+   raw_spin_lock_irq(&ctx->lock);
+   remove = (event->attach_state & PERF_ATTACH_CONTEXT);
+   raw_spin_unlock_irq(&ctx->lock);
+
+   if (remove)
+   event_function_call(event, __perf_remove_from_context, (void 
*)flags);
 
/*
 * The above event_function_call() can NO-OP when it hits
@@ -4198,41 +4209,68 @@ static void perf_event_enable_on_exec(int ctxn)
 static void perf_remove_from_owner(struct perf_event *event);
 static void perf_event_exit_event(struct perf_event *child_event,
  struct perf_event_context *child_ctx,
- struct task_struct *child);
+ struct task_struct *child,
+ bool removed);
 
 /*
  * Removes all events from the current task that have been marked
  * remove-on-exec, and feeds their values back to parent events.
  */
-static void perf_event_remove_on_exec(void)
+static void perf_event_remove_on_exec(int ctxn)
 {
-   int ctxn;
+   struct perf_event_context *ctx, *clone_ctx = NULL;
+   struct perf_event *event, *next;
+   LIST_HEAD(free_list);
+   unsigned long flags;
+   bool modified = false;
 
-   for_each_task_context_nr(ctxn) {
-   struct perf_event_context *ctx;
-   struct perf_event *event, *next;
+   ctx = perf_pin_task_context(current, ctxn);
+   if (!ctx)
+   return;
 
-   ctx = perf_pin_task_context(current, ctxn);
-   if (!ctx)
+   mutex_lock(&ctx->mutex);
+
+   if (WARN_ON_ONCE(ctx->task != current))
+   goto unlock;
+
+   list_for_each_entry_safe(event, next, &ctx->event_list, event_entry) {
+   struct perf_event *parent_event = event->parent;
+
+   if (!event->attr.remove_on_exec)
continue;
-   mutex_lock(&ctx->mutex);
 
-   list_for_each_entry_safe(event, nex

  1   2   3   4   5   6   7   8   9   10   >