Re: [PATCH v2 04/12] uprobes: revamp uprobe refcounting and lifetime management

2024-07-04 Thread Peter Zijlstra
On Thu, Jul 04, 2024 at 10:03:48AM +0200, Peter Zijlstra wrote:

> diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
> index c98e3b3386ba..4aafb4485be7 100644
> --- a/kernel/trace/trace_uprobe.c
> +++ b/kernel/trace/trace_uprobe.c
> @@ -1112,7 +1112,8 @@ static void __probe_event_disable(struct trace_probe *tp)
>   if (!tu->inode)
>   continue;
>  
> -		uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
> +		uprobe_unregister(tu->inode, tu->offset, &tu->consumer,
> +				  list_is_last(trace_probe_probe_list(tp), &tu->tp.list) ? 0 : URF_NO_SYNC);
>   tu->inode = NULL;
>   }
>  }


Hmm, that continue clause might ruin things. Still easy enough to add
uprobe_unregister_sync() and simply always pass URF_NO_SYNC.

I really don't see why we should make this more complicated than it
needs to be.
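For illustration, the caller-side pattern this enables boils down to something
like the sketch below (URF_NO_SYNC and uprobe_unregister_sync() as introduced
in the diff that follows; the probes array and its struct are made up for the
example):

/*
 * Sketch only: batch-unregister a set of consumers while paying for a
 * single SRCU grace period instead of one per uprobe_unregister() call.
 */
static void unregister_all(struct inode *inode, struct my_probe *probes, u32 cnt)
{
	u32 i;

	for (i = 0; i < cnt; i++)
		uprobe_unregister(inode, probes[i].offset, &probes[i].consumer,
				  URF_NO_SYNC);	/* skip per-call synchronize_srcu() */

	if (cnt > 0)
		uprobe_unregister_sync();	/* one grace period for the whole batch */
}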

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 354cab634341..681741a51df3 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -115,7 +115,9 @@ extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm
 extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
 extern int uprobe_register_refctr(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc);
 extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool);
-extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
+#define URF_NO_SYNC	0x01
+extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, unsigned int flags);
+extern void uprobe_unregister_sync(void);
 extern int uprobe_mmap(struct vm_area_struct *vma);
 extern void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end);
 extern void uprobe_start_dup_mmap(void);
@@ -165,7 +167,7 @@ uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, boo
 	return -ENOSYS;
 }
 static inline void
-uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
+uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, unsigned int flags)
 {
 }
 static inline int uprobe_mmap(struct vm_area_struct *vma)
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 0b7574a54093..d09f7b942076 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1145,7 +1145,7 @@ __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
  * @offset: offset from the start of the file.
  * @uc: identify which probe if multiple probes are colocated.
  */
-void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
+void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, unsigned int flags)
 {
 	scoped_guard (srcu, &uprobes_srcu) {
 		struct uprobe *uprobe = find_uprobe(inode, offset);
@@ -1157,10 +1157,17 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
 		mutex_unlock(&uprobe->register_mutex);
 	}
 
-	synchronize_srcu(&uprobes_srcu); // XXX amortize / batch
+	if (!(flags & URF_NO_SYNC))
+		synchronize_srcu(&uprobes_srcu);
 }
 EXPORT_SYMBOL_GPL(uprobe_unregister);
 
+void uprobe_unregister_sync(void)
+{
+	synchronize_srcu(&uprobes_srcu);
+}
+EXPORT_SYMBOL_GPL(uprobe_unregister_sync);
+
 /*
  * __uprobe_register - register a probe
  * @inode: the file in which the probe has to be placed.
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index d1daeab1bbc1..1f6adabbb1e7 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -3181,9 +3181,10 @@ static void bpf_uprobe_unregister(struct path *path, struct bpf_uprobe *uprobes,
 	u32 i;
 
 	for (i = 0; i < cnt; i++) {
-		uprobe_unregister(d_real_inode(path->dentry), uprobes[i].offset,
-				  &uprobes[i].consumer);
+		uprobe_unregister(d_real_inode(path->dentry), uprobes[i].offset,
+				  &uprobes[i].consumer, URF_NO_SYNC);
}
+   if (cnt > 0)
+   uprobe_unregister_sync();
 }
 
 static void bpf_uprobe_multi_link_release(struct bpf_link *link)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index c98e3b3386ba..6b64470a1c5c 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1104,6 +1104,7 @@ static int trace_uprobe_enable(struct trace_uprobe *tu, filter_func_t filter)
 static void __probe_event_disable(struct trace_probe *tp)
 {
struct trace_uprobe *tu;
+   bool sync = false;
 
tu = container_of(tp, struct trace_uprobe, tp);
WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter));
@@ -1112,9 +1113,12 @@ static void __probe_event_disable(struct trace_probe *tp)

Re: [PATCH v2 04/12] uprobes: revamp uprobe refcounting and lifetime management

2024-07-04 Thread Peter Zijlstra
On Wed, Jul 03, 2024 at 01:47:23PM -0700, Andrii Nakryiko wrote:

> > When I cobble all that together (it really shouldn't be one patch, but
> > you get the idea I hope) it looks a little something like the below.
> >
> > I *think* it should work, but perhaps I've missed something?
> 
> Well, at the very least you missed that we can't delay SRCU (or any
> other sleepable RCU flavor) potentially indefinitely for uretprobes,
> which are completely under user space control.

Sure, but that's fixable. You can work around that by having (u)tasks
with a non-empty return_instance list carry a timer. When/if that timer
fires, it goes and converts the SRCU references to actual references.

Not so very hard to do, but very much not needed for a PoC.
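For the record, a very rough sketch of that timer idea (all of the names and
fields below are hypothetical, this is not part of the posted patch): when the
timer fires, walk the pending return_instance list, take real references on the
uprobes while SRCU still pins them, and then close the SRCU read-side section.

/*
 * Hypothetical sketch: a per-task timer that converts SRCU protection of
 * pending return_instances into plain refcounts once they have been
 * pending for "too long", so the SRCU grace period is no longer held up
 * by user space sitting in the traced function.
 */
static void uretprobe_srcu_timer(struct timer_list *t)
{
	struct uprobe_task *utask = from_timer(utask, t, ri_timer);
	struct return_instance *ri;

	for (ri = utask->return_instances; ri; ri = ri->next)
		get_uprobe(ri->uprobe);		/* real reference, SRCU still pins it */

	/* now the SRCU section held since the uretprobe hit can be closed */
	srcu_read_unlock(&uretprobes_srcu, utask->ri_srcu_idx);
}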



Re: [PATCH v2 04/12] uprobes: revamp uprobe refcounting and lifetime management

2024-07-04 Thread Peter Zijlstra
On Wed, Jul 03, 2024 at 01:47:23PM -0700, Andrii Nakryiko wrote:
> Your innocuous "// XXX amortize / batch" comment below is *the major
> point of this patch set*. Try to appreciate that. It's not a small
> todo, it took this entire patch set to allow for that.

Tada!

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 354cab634341..c9c9ec87ab9a 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -115,7 +115,8 @@ extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm
 extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
 extern int uprobe_register_refctr(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc);
 extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool);
-extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
+#define URF_NO_SYNC	0x01
+extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, unsigned int flags);
 extern int uprobe_mmap(struct vm_area_struct *vma);
 extern void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end);
 extern void uprobe_start_dup_mmap(void);
@@ -165,7 +166,7 @@ uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, boo
 	return -ENOSYS;
 }
 static inline void
-uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
+uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, unsigned int flags)
 {
 }
 static inline int uprobe_mmap(struct vm_area_struct *vma)
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 0b7574a54093..1f4151c518ed 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1145,7 +1145,7 @@ __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
  * @offset: offset from the start of the file.
  * @uc: identify which probe if multiple probes are colocated.
  */
-void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
+void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, unsigned int flags)
 {
 	scoped_guard (srcu, &uprobes_srcu) {
 		struct uprobe *uprobe = find_uprobe(inode, offset);
@@ -1157,7 +1157,8 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
 		mutex_unlock(&uprobe->register_mutex);
 	}
 
-	synchronize_srcu(&uprobes_srcu); // XXX amortize / batch
+	if (!(flags & URF_NO_SYNC))
+		synchronize_srcu(&uprobes_srcu);
 }
 EXPORT_SYMBOL_GPL(uprobe_unregister);
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index d1daeab1bbc1..950b5241244a 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -3182,7 +3182,7 @@ static void bpf_uprobe_unregister(struct path *path, struct bpf_uprobe *uprobes,
 
 	for (i = 0; i < cnt; i++) {
 		uprobe_unregister(d_real_inode(path->dentry), uprobes[i].offset,
-				  &uprobes[i].consumer);
+				  &uprobes[i].consumer, i != cnt-1 ? URF_NO_SYNC : 0);
}
 }
 
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index c98e3b3386ba..4aafb4485be7 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1112,7 +1112,8 @@ static void __probe_event_disable(struct trace_probe *tp)
if (!tu->inode)
continue;
 
-		uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
+		uprobe_unregister(tu->inode, tu->offset, &tu->consumer,
+				  list_is_last(trace_probe_probe_list(tp), &tu->tp.list) ? 0 : URF_NO_SYNC);
tu->inode = NULL;
}
 }



Re: [PATCH v2 04/12] uprobes: revamp uprobe refcounting and lifetime management

2024-07-03 Thread Peter Zijlstra
On Mon, Jul 01, 2024 at 03:39:27PM -0700, Andrii Nakryiko wrote:

> One, attempted initially, way to solve this is through using
> atomic_inc_not_zero() approach, turning get_uprobe() into
> try_get_uprobe(),

This is the canonical thing to do. Everybody does this.

> which can fail to bump refcount if uprobe is already
> destined to be destroyed. This, unfortunately, turns out to be rather
> expensive due to underlying cmpxchg() operation in
> atomic_inc_not_zero() and scales rather poorly with increased amount of
> parallel threads triggering uprobes.

Different archs different trade-offs. You'll not see this on LL/SC archs
for example.
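For reference, the canonical try-get is just a conditional increment, roughly:

/*
 * Sketch of the try_get_uprobe() variant being discussed: only take a
 * reference if the refcount has not already dropped to zero.
 */
static struct uprobe *try_get_uprobe(struct uprobe *uprobe)
{
	if (refcount_inc_not_zero(&uprobe->ref))
		return uprobe;
	return NULL;	/* lost the race against the final put */
}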

> Furthermore, CPU profiling showed the following overall CPU usage:
>   - try_get_uprobe (19.3%) + put_uprobe (8.2%) = 27.5% CPU usage for
> atomic_inc_not_zero approach;
>   - __get_uprobe (12.3%) + put_uprobe (9.9%) = 22.2% CPU usage for
> atomic_add_and_return approach implemented by this patch.

I think those numbers suggest trying to not have a refcount in the first
place. Both are pretty terrible, yes one is less terrible than the
other, but still terrible.

Specifically, I'm thinking it is the refcounting in handle_swbp() that
is actually the problem; all the other stuff is noise.

So if you have SRCU-protected consumers, what is the reason for still
having a refcount in handle_swbp()? Simply have the whole of it inside
a single SRCU critical section, then all consumers you find get a hit.

Hmm, return probes are a pain, they require the uprobe to stay extant
between handle_swbp() and handle_trampoline(). I'm thinking we can do
that with SRCU as well.
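Roughly, the handler-side idea amounts to the sketch below (heavily simplified,
not the actual patch; uprobes_srcu as in the diff further down, the helpers are
the existing ones in kernel/events/uprobes.c):

static void handle_swbp_sketch(struct pt_regs *regs)
{
	unsigned long bp_vaddr = uprobe_get_swbp_addr(regs);
	struct uprobe *uprobe;
	int is_swbp;

	/* one SRCU read-side section pins the uprobe and its consumer list */
	guard(srcu)(&uprobes_srcu);

	uprobe = find_active_uprobe(bp_vaddr, &is_swbp);	/* no refcount taken */
	if (!uprobe)
		return;

	handler_chain(uprobe, regs);	/* run all consumers inside the SRCU section */
}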

When I cobble all that together (it really shouldn't be one patch, but
you get the idea I hope) it looks a little something like the below.

I *think* it should work, but perhaps I've missed something?

TL;DR replace treelock with seqcount+SRCU
  replace register_rwsem with SRCU
  replace handle_swbp() refcount with SRCU
  replace return_instance refcount with a second SRCU

Paul, I had to do something vile with SRCU. The basic problem is that we
want to keep a SRCU critical section across fork(), which leads to both
parent and child doing srcu_read_unlock() on the same idx. As such, I need an
extra increment on the @idx ssp counter to even things out, see
__srcu_read_clone_lock().
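(For the curious: the helper itself can be as small as the sketch below. This
is a guess at the tree-SRCU internals it would touch, mirroring what
__srcu_read_lock() does, not necessarily the exact hunk from the diffstat.)

void __srcu_read_clone_lock(struct srcu_struct *ssp, int idx)
{
	/* account one extra reader on @idx for the child created by fork() */
	this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter);
}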

---
 include/linux/rbtree.h  |  45 +
 include/linux/srcu.h|   2 +
 include/linux/uprobes.h |   2 +
 kernel/events/uprobes.c | 166 +++-
 kernel/rcu/srcutree.c   |   5 ++
 5 files changed, 161 insertions(+), 59 deletions(-)

diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
index f7edca369eda..9847fa58a287 100644
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -244,6 +244,31 @@ rb_find_add(struct rb_node *node, struct rb_root *tree,
return NULL;
 }
 
+static __always_inline struct rb_node *
+rb_find_add_rcu(struct rb_node *node, struct rb_root *tree,
+   int (*cmp)(struct rb_node *, const struct rb_node *))
+{
+   struct rb_node **link = &tree->rb_node;
+   struct rb_node *parent = NULL;
+   int c;
+
+   while (*link) {
+   parent = *link;
+   c = cmp(node, parent);
+
+   if (c < 0)
+   link = &parent->rb_left;
+   else if (c > 0)
+   link = &parent->rb_right;
+   else
+   return parent;
+   }
+
+   rb_link_node_rcu(node, parent, link);
+   rb_insert_color(node, tree);
+   return NULL;
+}
+
 /**
  * rb_find() - find @key in tree @tree
  * @key: key to match
@@ -272,6 +297,26 @@ rb_find(const void *key, const struct rb_root *tree,
return NULL;
 }
 
+static __always_inline struct rb_node *
+rb_find_rcu(const void *key, const struct rb_root *tree,
+   int (*cmp)(const void *key, const struct rb_node *))
+{
+   struct rb_node *node = tree->rb_node;
+
+   while (node) {
+   int c = cmp(key, node);
+
+   if (c < 0)
+   node = rcu_dereference_raw(node->rb_left);
+   else if (c > 0)
+   node = rcu_dereference_raw(node->rb_right);
+   else
+   return node;
+   }
+
+   return NULL;
+}
+
 /**
  * rb_find_first() - find the first @key in @tree
  * @key: key to match
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 236610e4a8fa..9b14acecbb9d 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -55,7 +55,9 @@ void call_srcu(struct srcu_struct *ssp, struct rcu_head *head,
void (*func)(struct rcu_head *head));
 void cleanup_srcu_struct(struct srcu_struct *ssp);
 int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp);
+void __srcu_read_clone_lock(struct srcu_struct *ssp, int idx);
 void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp);
+
 void synchronize_srcu(struct srcu_struct *ssp);
 unsigned long 

Re: [PATCH v2 05/12] uprobes: move offset and ref_ctr_offset into uprobe_consumer

2024-07-03 Thread Peter Zijlstra
On Mon, Jul 01, 2024 at 03:39:28PM -0700, Andrii Nakryiko wrote:
> Simplify uprobe registration/unregistration interfaces by making offset
> and ref_ctr_offset part of uprobe_consumer "interface". In practice, all
> existing users already store these fields somewhere in uprobe_consumer's
> containing structure, so this doesn't pose any problem. We just move
> some fields around.
> 
> On the other hand, this simplifies uprobe_register() and
> uprobe_unregister() API by having only struct uprobe_consumer as one
> thing representing attachment/detachment entity. This makes batched
> versions of uprobe_register() and uprobe_unregister() simpler.
> 
> This also makes uprobe_register_refctr() unnecessary, so remove it and
> simplify consumers.
> 
> No functional changes intended.
> 
> Acked-by: Masami Hiramatsu (Google) 
> Signed-off-by: Andrii Nakryiko 
> ---
>  include/linux/uprobes.h   | 18 +++
>  kernel/events/uprobes.c   | 19 ++-
>  kernel/trace/bpf_trace.c  | 21 +++-
>  kernel/trace/trace_uprobe.c   | 53 ---
>  .../selftests/bpf/bpf_testmod/bpf_testmod.c   | 22 
>  5 files changed, 55 insertions(+), 78 deletions(-)
> 
> diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
> index b503fafb7fb3..a75ba37ce3c8 100644
> --- a/include/linux/uprobes.h
> +++ b/include/linux/uprobes.h
> @@ -42,6 +42,11 @@ struct uprobe_consumer {
>  				enum uprobe_filter_ctx ctx,
>  				struct mm_struct *mm);
>  
> +	/* associated file offset of this probe */
> +	loff_t offset;
> +	/* associated refctr file offset of this probe, or zero */
> +	loff_t ref_ctr_offset;
> +	/* for internal uprobe infra use, consumers shouldn't touch fields below */
>  	struct uprobe_consumer *next;
>  };
>  
> @@ -110,10 +115,9 @@ extern bool is_trap_insn(uprobe_opcode_t *insn);
>  extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs);
>  extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs);
>  extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t);
> -extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
> -extern int uprobe_register_refctr(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc);
> +extern int uprobe_register(struct inode *inode, struct uprobe_consumer *uc);
>  extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool);
> -extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
> +extern void uprobe_unregister(struct inode *inode, struct uprobe_consumer *uc);

It seems very weird and unnatural to split inode and offset like this.
The whole offset thing only makes sense within the context of an inode.

So yeah, let's not do this.



Re: [PATCH v2 00/12] uprobes: add batched register/unregister APIs and per-CPU RW semaphore

2024-07-02 Thread Peter Zijlstra
On Mon, Jul 01, 2024 at 03:39:23PM -0700, Andrii Nakryiko wrote:
> This patch set, ultimately, switches global uprobes_treelock from RW spinlock
> to per-CPU RW semaphore, which has better performance and scales better under
> contention and multiple parallel threads triggering lots of uprobes.

Why not RCU + normal lock thing?



Re: [PATCH v2 04/12] uprobes: revamp uprobe refcounting and lifetime management

2024-07-02 Thread Peter Zijlstra
On Mon, Jul 01, 2024 at 03:39:27PM -0700, Andrii Nakryiko wrote:

> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index 23449a8c5e7e..560cf1ca512a 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -53,9 +53,10 @@ DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);
>  
>  struct uprobe {
>  	struct rb_node		rb_node;	/* node in the rb tree */
> -	refcount_t		ref;
> +	atomic64_t		ref;	/* see UPROBE_REFCNT_GET below */
>  	struct rw_semaphore	register_rwsem;
>  	struct rw_semaphore	consumer_rwsem;
> +	struct rcu_head		rcu;
>  	struct list_head	pending_list;
>  	struct uprobe_consumer	*consumers;
>  	struct inode		*inode;		/* Also hold a ref to inode */
> @@ -587,15 +588,138 @@ set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long v
>  			*(uprobe_opcode_t *)&auprobe->insn);
>  }
>  
> -static struct uprobe *get_uprobe(struct uprobe *uprobe)
> +/*
> + * Uprobe's 64-bit refcount is actually two independent counters co-located in
> + * a single u64 value:
> + *   - lower 32 bits are just a normal refcount which is incremented and
> + *   decremented on get and put, respectively, just like normal refcount
> + *   would;
> + *   - upper 32 bits are a tag (or epoch, if you will), which is always
> + *   incremented by one, no matter whether get or put operation is done.
> + *
> + * This upper counter is meant to distinguish between:
> + *   - one CPU dropping refcnt from 1 -> 0 and proceeding with "destruction",
> + *   - while another CPU continuing further meanwhile with 0 -> 1 -> 0 refcnt
> + *   sequence, also proceeding to "destruction".
> + *
> + * In both cases refcount drops to zero, but in one case it will have epoch N,
> + * while the second drop to zero will have a different epoch N + 2, allowing
> + * first destructor to bail out because epoch changed between refcount going
> + * to zero and put_uprobe() taking uprobes_treelock (under which overall
> + * 64-bit refcount is double-checked, see put_uprobe() for details).
> + *
> + * Lower 32-bit counter is not meant to overflow, while it's expected

So refcount_t very explicitly handles both overflow and underflow and
screams bloody murder if they happen. Your thing does not.. 

> + * that upper 32-bit counter will overflow occasionally. Note, though, that we
> + * can't allow upper 32-bit counter to "bleed over" into lower 32-bit counter,
> + * so whenever epoch counter gets highest bit set to 1, __get_uprobe() and
> + * put_uprobe() will attempt to clear upper bit with cmpxchg(). This makes
> + * epoch effectively a 31-bit counter with highest bit used as a flag to
> + * perform a fix-up. This ensures epoch and refcnt parts do not "interfere".
> + *
> + * UPROBE_REFCNT_GET constant is chosen such that it will *increment both*
> + * epoch and refcnt parts atomically with one atomic_add().
> + * UPROBE_REFCNT_PUT is chosen such that it will *decrement* refcnt part and
> + * *increment* epoch part.
> + */
> +#define UPROBE_REFCNT_GET ((1LL << 32) + 1LL) /* 0x0000000100000001LL */
> +#define UPROBE_REFCNT_PUT ((1LL << 32) - 1LL) /* 0x00000000ffffffffLL */
> +
> +/*
> + * Caller has to make sure that:
> + *   a) either uprobe's refcnt is positive before this call;
> + *   b) or uprobes_treelock is held (doesn't matter if for read or write),
> + *  preventing uprobe's destructor from removing it from uprobes_tree.
> + *
> + * In the latter case, uprobe's destructor will "resurrect" uprobe instance if
> + * it detects that its refcount went back to being positive again in between it
> + * dropping to zero at some point and (potentially delayed) destructor
> + * callback actually running.
> + */
> +static struct uprobe *__get_uprobe(struct uprobe *uprobe)
>  {
> -	refcount_inc(&uprobe->ref);
> +	s64 v;
> +
> +	v = atomic64_add_return(UPROBE_REFCNT_GET, &uprobe->ref);

Distinct lack of u32 overflow testing here..

> +
> + /*
> +  * If the highest bit is set, we need to clear it. If cmpxchg() fails,
> +  * we don't retry because there is another CPU that just managed to
> +  * update refcnt and will attempt the same "fix up". Eventually one of
> +  * them will succeed to clear the highest bit.
> +  */
> + if (unlikely(v < 0))
> +		(void)atomic64_cmpxchg(&uprobe->ref, v, v & ~(1ULL << 63));
> +
>   return uprobe;
>  }

>  static void put_uprobe(struct uprobe *uprobe)
>  {
> -	if (refcount_dec_and_test(&uprobe->ref)) {
> + s64 v;
> +
> + /*
> +  * here uprobe instance is guaranteed to be alive, so we use Tasks
> +  * Trace RCU to guarantee that uprobe won't be freed from under us, if

What's wrong with normal RCU?

> +  * we end up being a losing "destructor" inside uprobe_treelock'ed
> +  * section double-checking uprobe->ref value below.
> +  * Note call_rcu_tasks_trace() + uprobe_free_rcu 

Re: [PATCH 2/4] perf,uprobes: fix user stack traces in the presence of pending uretprobes

2024-05-15 Thread Peter Zijlstra
On Wed, May 08, 2024 at 02:26:03PM -0700, Andrii Nakryiko wrote:

> +static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entry,
> +					       int start_entry_idx)
> +{
> +#ifdef CONFIG_UPROBES
> + struct uprobe_task *utask = current->utask;
> + struct return_instance *ri;
> + __u64 *cur_ip, *last_ip, tramp_addr;
> +
> + if (likely(!utask || !utask->return_instances))
> + return;
> +
> +	cur_ip = &entry->ip[start_entry_idx];
> +	last_ip = &entry->ip[entry->nr - 1];
> + ri = utask->return_instances;
> + tramp_addr = uprobe_get_trampoline_vaddr();
> +
> + /* If there are pending uretprobes for current thread, they are

Comment style fail. Also 'for *the* current thread'.

> +  * recorded in a list inside utask->return_instances; each such
> +  * pending uretprobe replaces traced user function's return address on
> +  * the stack, so when stack trace is captured, instead of seeing
> +  * actual function's return address, we'll have one or many uretprobe
> +  * trampoline addresses in the stack trace, which are not helpful and
> +  * misleading to users.

I would beg to differ, what if the uprobe is causing the performance
issue?

While I do think it makes sense to fix the unwind in the sense that we
should be able to continue the unwind, I don't think it makes sense to
completely hide the presence of uprobes.

> +  * So here we go over the pending list of uretprobes, and each
> +  * encountered trampoline address is replaced with actual return
> +  * address.
> +  */
> + while (ri && cur_ip <= last_ip) {
> + if (*cur_ip == tramp_addr) {
> + *cur_ip = ri->orig_ret_vaddr;
> + ri = ri->next;
> + }
> + cur_ip++;
> + }
> +#endif
> +}