Re: [PATCH v2 01/13] perf: Ensure perf_guest_cbs aren't reloaded between !NULL check and deref

2021-09-16 Thread Sean Christopherson
On Sat, Aug 28, 2021, Peter Zijlstra wrote:
> On Fri, Aug 27, 2021 at 05:35:46PM -0700, Sean Christopherson wrote:
> > diff --git a/kernel/events/core.c b/kernel/events/core.c
> > index 464917096e73..2126f6327321 100644
> > --- a/kernel/events/core.c
> > +++ b/kernel/events/core.c
> > @@ -6491,14 +6491,19 @@ struct perf_guest_info_callbacks *perf_guest_cbs;
> >  
> >  int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
> >  {
> > -   perf_guest_cbs = cbs;
> > +   if (WARN_ON_ONCE(perf_guest_cbs))
> > +   return -EBUSY;
> > +
> > +   WRITE_ONCE(perf_guest_cbs, cbs);
> > +   synchronize_rcu();
> 
> You're waiting for all NULL users to go away? :-) IOW, we can do without
> this synchronize_rcu() call.

Doh, right.  I was thinking KVM needed to wait for in-progress NMIs to exit to
ensure guest PT interrupts are handled correctly, but obviously the NMI handler
needs to exit before that CPU can get back into a guest...

> > return 0;
> >  }
> >  EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
> >  
> >  int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
> >  {
> > -   perf_guest_cbs = NULL;
> 
>   if (WARN_ON_ONCE(perf_guest_cbs != cbs))
>   return -EBUSY;
> 
> ?

Works for me.  I guess I'm more optimistic about people not being morons :-)
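
For completeness, I'd expect the unregister path to end up looking roughly like
the below (sketch only against this v2, exact error code and all still TBD):

  int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
  {
	/* Peter's sanity check: only the registered callbacks may unregister. */
	if (WARN_ON_ONCE(perf_guest_cbs != cbs))
		return -EBUSY;

	WRITE_ONCE(perf_guest_cbs, NULL);

	/* Wait for readers that snapshotted the non-NULL pointer. */
	synchronize_rcu();
	return 0;
  }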



Re: [PATCH v2 01/13] perf: Ensure perf_guest_cbs aren't reloaded between !NULL check and deref

2021-08-28 Thread Peter Zijlstra
On Fri, Aug 27, 2021 at 05:35:46PM -0700, Sean Christopherson wrote:

> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 2d510ad750ed..6b0405e578c1 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -1237,6 +1237,14 @@ extern void perf_event_bpf_event(struct bpf_prog *prog,
>u16 flags);
>  
>  extern struct perf_guest_info_callbacks *perf_guest_cbs;
> +static inline struct perf_guest_info_callbacks *perf_get_guest_cbs(void)
> +{
> + /* Reg/unreg perf_guest_cbs waits for readers via synchronize_rcu(). */
> + lockdep_assert_preemption_disabled();
> +
> + /* Prevent reloading between a !NULL check and dereferences. */
> + return READ_ONCE(perf_guest_cbs);
> +}

Nice..

>  extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
>  extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
>  
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 464917096e73..2126f6327321 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -6491,14 +6491,19 @@ struct perf_guest_info_callbacks *perf_guest_cbs;
>  
>  int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
>  {
> - perf_guest_cbs = cbs;
> + if (WARN_ON_ONCE(perf_guest_cbs))
> + return -EBUSY;
> +
> + WRITE_ONCE(perf_guest_cbs, cbs);
> + synchronize_rcu();

You're waiting for all NULL users to go away? :-) IOW, we can do without
this synchronize_rcu() call.

>   return 0;
>  }
>  EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
>  
>  int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
>  {
> - perf_guest_cbs = NULL;

	if (WARN_ON_ONCE(perf_guest_cbs != cbs))
		return -EBUSY;

?

> + WRITE_ONCE(perf_guest_cbs, NULL);
> + synchronize_rcu();
>   return 0;
>  }
>  EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);

Yes, this ought to work fine.
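
For context, the KVM side then pairs the two roughly like so (rough sketch,
the KVM helper names are illustrative):

  static struct perf_guest_info_callbacks kvm_guest_cbs = {
	.is_in_guest	= kvm_is_in_guest,
	.is_user_mode	= kvm_is_user_mode,
	.get_guest_ip	= kvm_get_guest_ip,
  };

  /* module init */
  perf_register_guest_info_callbacks(&kvm_guest_cbs);

  /* module exit, before tearing down anything the callbacks rely on */
  perf_unregister_guest_info_callbacks(&kvm_guest_cbs);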



[PATCH v2 01/13] perf: Ensure perf_guest_cbs aren't reloaded between !NULL check and deref

2021-08-27 Thread Sean Christopherson
Protect perf_guest_cbs with READ_ONCE/WRITE_ONCE to ensure it's not
reloaded between a !NULL check and a dereference, and wait for all
readers via synchronize_rcu() to prevent use-after-free, e.g. if the
callbacks are being unregistered during module unload.  Because the
callbacks are global, it's possible for readers to run in parallel with
an unregister operation.

The bug has escaped notice because all dereferences of perf_guest_cbs
follow the same "perf_guest_cbs && perf_guest_cbs->is_in_guest()" pattern,
and it's extremely unlikely a compiler will reload perf_guest_cbs in this
sequence.  Compilers do reload perf_guest_cbs for future derefs, e.g. for
->is_user_mode(), but the ->is_in_guest() guard all but guarantees the
PMI handler will win the race; to nullify perf_guest_cbs, KVM has to
completely exit the guest and tear down all VMs before KVM starts its
module unload / unregister sequence.
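
To illustrate (this snippet is not part of the diff below, it just mirrors the
arm perf_misc_flags() conversion), the fragile pattern versus the snapshot
taken via the new helper:

	/* Racy: the compiler is free to reload perf_guest_cbs between uses. */
	if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
		misc |= perf_guest_cbs->is_user_mode() ? PERF_RECORD_MISC_GUEST_USER
						       : PERF_RECORD_MISC_GUEST_KERNEL;

	/* Safe: snapshot the pointer once and only deref the local copy. */
	struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs();

	if (guest_cbs && guest_cbs->is_in_guest())
		misc |= guest_cbs->is_user_mode() ? PERF_RECORD_MISC_GUEST_USER
						  : PERF_RECORD_MISC_GUEST_KERNEL;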

But with help, unloading kvm_intel can trigger a NULL pointer dereference,
e.g. wrapping perf_guest_cbs with READ_ONCE in perf_misc_flags() while
spamming kvm_intel module load/unload leads to:

  BUG: kernel NULL pointer dereference, address: 
  #PF: supervisor read access in kernel mode
  #PF: error_code(0x) - not-present page
  PGD 0 P4D 0
  Oops:  [#1] PREEMPT SMP
  CPU: 6 PID: 1825 Comm: stress Not tainted 5.14.0-rc2+ #459
  Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
  RIP: 0010:perf_misc_flags+0x1c/0x70
  Call Trace:
   perf_prepare_sample+0x53/0x6b0
   perf_event_output_forward+0x67/0x160
   __perf_event_overflow+0x52/0xf0
   handle_pmi_common+0x207/0x300
   intel_pmu_handle_irq+0xcf/0x410
   perf_event_nmi_handler+0x28/0x50
   nmi_handle+0xc7/0x260
   default_do_nmi+0x6b/0x170
   exc_nmi+0x103/0x130
   asm_exc_nmi+0x76/0xbf

Fixes: 39447b386c84 ("perf: Enhance perf to allow for guest statistic collection from host")
Cc: stable@vger.kernel.org
Signed-off-by: Sean Christopherson 
---
 arch/arm/kernel/perf_callchain.c   | 17 +++--
 arch/arm64/kernel/perf_callchain.c | 18 --
 arch/csky/kernel/perf_callchain.c  |  6 --
 arch/nds32/kernel/perf_event_cpu.c | 17 +++--
 arch/riscv/kernel/perf_callchain.c |  7 +--
 arch/x86/events/core.c | 17 +++--
 arch/x86/events/intel/core.c   |  9 ++---
 include/linux/perf_event.h |  8 
 kernel/events/core.c   |  9 +++--
 9 files changed, 75 insertions(+), 33 deletions(-)

diff --git a/arch/arm/kernel/perf_callchain.c b/arch/arm/kernel/perf_callchain.c
index 3b69a76d341e..1626dfc6f6ce 100644
--- a/arch/arm/kernel/perf_callchain.c
+++ b/arch/arm/kernel/perf_callchain.c
@@ -62,9 +62,10 @@ user_backtrace(struct frame_tail __user *tail,
 void
perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
+   struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs();
struct frame_tail __user *tail;
 
-   if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+   if (guest_cbs && guest_cbs->is_in_guest()) {
/* We don't support guest os callchain now */
return;
}
@@ -98,9 +99,10 @@ callchain_trace(struct stackframe *fr,
 void
perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
+   struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs();
struct stackframe fr;
 
-   if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+   if (guest_cbs && guest_cbs->is_in_guest()) {
/* We don't support guest os callchain now */
return;
}
@@ -111,18 +113,21 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re
 
 unsigned long perf_instruction_pointer(struct pt_regs *regs)
 {
-   if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
-   return perf_guest_cbs->get_guest_ip();
+   struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs();
+
+   if (guest_cbs && guest_cbs->is_in_guest())
+   return guest_cbs->get_guest_ip();
 
return instruction_pointer(regs);
 }
 
 unsigned long perf_misc_flags(struct pt_regs *regs)
 {
+   struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs();
int misc = 0;
 
-   if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
-   if (perf_guest_cbs->is_user_mode())
+   if (guest_cbs && guest_cbs->is_in_guest()) {
+   if (guest_cbs->is_user_mode())
misc |= PERF_RECORD_MISC_GUEST_USER;
else
misc |= PERF_RECORD_MISC_GUEST_KERNEL;
diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c
index 4a72c2727309..86d9f2013172 100644
--- a/arch/arm64/kernel/perf_callchain.c
+++ b/arch/arm64/kernel/perf_callchain.c
@@ -102,7 +102,9 @@ compat_user_backtrace(struct compat_frame_tail __user *tail,