On Mon, Jul 01, 2024 at 03:39:27PM -0700, Andrii Nakryiko wrote:

> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index 23449a8c5e7e..560cf1ca512a 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -53,9 +53,10 @@ DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);
>  
>  struct uprobe {
>       struct rb_node          rb_node;        /* node in the rb tree */
> -     refcount_t              ref;
> +     atomic64_t              ref;            /* see UPROBE_REFCNT_GET below */
>       struct rw_semaphore     register_rwsem;
>       struct rw_semaphore     consumer_rwsem;
> +     struct rcu_head         rcu;
>       struct list_head        pending_list;
>       struct uprobe_consumer  *consumers;
>       struct inode            *inode;         /* Also hold a ref to inode */
> @@ -587,15 +588,138 @@ set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long v
>                       *(uprobe_opcode_t *)&auprobe->insn);
>  }
>  
> -static struct uprobe *get_uprobe(struct uprobe *uprobe)
> +/*
> + * Uprobe's 64-bit refcount is actually two independent counters co-located in
> + * a single u64 value:
> + *   - lower 32 bits are just a normal refcount which is incremented and
> + *   decremented on get and put, respectively, just like a normal refcount
> + *   would be;
> + *   - upper 32 bits are a tag (or epoch, if you will), which is always
> + *   incremented by one, no matter whether a get or a put is done.
> + *
> + * This upper counter is meant to distinguish between:
> + *   - one CPU dropping refcnt from 1 -> 0 and proceeding with "destruction",
> + *   - another CPU meanwhile continuing with a 0 -> 1 -> 0 refcnt
> + *   sequence, also proceeding to "destruction".
> + *
> + * In both cases refcount drops to zero, but in one case it will have epoch N,
> + * while the second drop to zero will have a different epoch N + 2, allowing
> + * first destructor to bail out because epoch changed between refcount going
> + * to zero and put_uprobe() taking uprobes_treelock (under which overall
> + * 64-bit refcount is double-checked, see put_uprobe() for details).
> + *
> + * Lower 32-bit counter is not meant to overflow, while it's expected

So refcount_t very explicitly handles both overflow and underflow and
screams bloody murder if they happen. Your thing does not.. 

> + * that upper 32-bit counter will overflow occasionally. Note, though, that we
> + * can't allow upper 32-bit counter to "bleed over" into lower 32-bit counter,
> + * so whenever epoch counter gets highest bit set to 1, __get_uprobe() and
> + * put_uprobe() will attempt to clear upper bit with cmpxchg(). This makes
> + * epoch effectively a 31-bit counter with highest bit used as a flag to
> + * perform a fix-up. This ensures epoch and refcnt parts do not "interfere".
> + *
> + * UPROBE_REFCNT_GET constant is chosen such that it will *increment both*
> + * epoch and refcnt parts atomically with one atomic_add().
> + * UPROBE_REFCNT_PUT is chosen such that it will *decrement* refcnt part and
> + * *increment* epoch part.
> + */
> +#define UPROBE_REFCNT_GET ((1LL << 32) + 1LL) /* 0x0000000100000001LL */
> +#define UPROBE_REFCNT_PUT ((1LL << 32) - 1LL) /* 0x00000000ffffffffLL */
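
As a sanity check on the two constants, here is the intended arithmetic reduced
to a throwaway user-space sketch (a plain uint64_t stands in for the atomic64_t,
no concurrency; the refcnt()/epoch() helpers are mine, purely for illustration):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define UPROBE_REFCNT_GET ((1LL << 32) + 1LL) /* 0x0000000100000001LL */
#define UPROBE_REFCNT_PUT ((1LL << 32) - 1LL) /* 0x00000000ffffffffLL */

static uint32_t refcnt(uint64_t v) { return (uint32_t)v; }         /* lower 32 bits */
static uint32_t epoch(uint64_t v)  { return (uint32_t)(v >> 32); } /* upper 32 bits */

int main(void)
{
	uint64_t ref = 0;

	ref += UPROBE_REFCNT_GET;   /* get: refcnt 0 -> 1, epoch 0 -> 1 */
	assert(refcnt(ref) == 1 && epoch(ref) == 1);

	ref += UPROBE_REFCNT_PUT;   /* put: refcnt 1 -> 0, epoch 1 -> 2 (carry out of the low half) */
	assert(refcnt(ref) == 0 && epoch(ref) == 2);

	printf("refcnt=%u epoch=%u\n", refcnt(ref), epoch(ref));
	return 0;
}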
> +
> +/*
> + * Caller has to make sure that:
> + *   a) either uprobe's refcnt is positive before this call;
> + *   b) or uprobes_treelock is held (doesn't matter if for read or write),
> + *      preventing uprobe's destructor from removing it from uprobes_tree.
> + *
> + * In the latter case, uprobe's destructor will "resurrect" uprobe instance if
> + * it detects that its refcount went back to being positive again in between it
> + * dropping to zero at some point and (potentially delayed) destructor
> + * callback actually running.
> + */
> +static struct uprobe *__get_uprobe(struct uprobe *uprobe)
>  {
> -     refcount_inc(&uprobe->ref);
> +     s64 v;
> +
> +     v = atomic64_add_return(UPROBE_REFCNT_GET, &uprobe->ref);

Distinct lack of u32 overflow testing here..

> +
> +     /*
> +      * If the highest bit is set, we need to clear it. If cmpxchg() fails,
> +      * we don't retry because there is another CPU that just managed to
> +      * update refcnt and will attempt the same "fix up". Eventually one of
> +      * them will succeed in clearing the highest bit.
> +      */
> +     if (unlikely(v < 0))
> +             (void)atomic64_cmpxchg(&uprobe->ref, v, v & ~(1ULL << 63));
> +
>       return uprobe;
>  }
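
And the fix-up itself, modeled with C11 atomics (single thread, so the
compare-exchange trivially succeeds; the point is only that masking off bit 63
leaves the refcnt half alone -- an illustration, not the kernel code):

#include <assert.h>
#include <stdatomic.h>
#include <stdint.h>

#define UPROBE_REFCNT_GET ((1LL << 32) + 1LL)

int main(void)
{
	/* epoch at 0x7fffffff, refcnt at 2: the next get sets bit 63 */
	_Atomic int64_t ref = ((int64_t)0x7fffffff << 32) | 2;
	int64_t v, fixed;

	/* fetch_add followed by a load stands in for atomic64_add_return() */
	atomic_fetch_add(&ref, UPROBE_REFCNT_GET);
	v = atomic_load(&ref);
	assert(v < 0);                          /* highest bit is now set */

	/* the non-retried fix-up: mask off bit 63, the low refcnt half is untouched */
	fixed = v & ~(1ULL << 63);
	atomic_compare_exchange_strong(&ref, &v, fixed);

	assert(atomic_load(&ref) >= 0);             /* flag bit cleared */
	assert((uint32_t)atomic_load(&ref) == 3);   /* refcnt: 2 + 1, unaffected */
	return 0;
}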

>  static void put_uprobe(struct uprobe *uprobe)
>  {
> -     if (refcount_dec_and_test(&uprobe->ref)) {
> +     s64 v;
> +
> +     /*
> +      * here uprobe instance is guaranteed to be alive, so we use Tasks
> +      * Trace RCU to guarantee that uprobe won't be freed from under us, if

What's wrong with normal RCU?

> +      * we end up being a losing "destructor" inside uprobe_treelock'ed
> +      * section double-checking uprobe->ref value below.
> +      * Note call_rcu_tasks_trace() + uprobe_free_rcu below.
> +      */
> +     rcu_read_lock_trace();
> +
> +     v = atomic64_add_return(UPROBE_REFCNT_PUT, &uprobe->ref);

No underflow handling... because nobody ever had a double put bug.
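
To make that concrete: with the packed counter, an extra put while refcnt is
already 0 doesn't even trip the (u32)v == 0 check below, it just leaves a bogus
huge refcnt behind (same user-space model of the arithmetic as above):

#include <assert.h>
#include <stdint.h>

#define UPROBE_REFCNT_GET ((1LL << 32) + 1LL)
#define UPROBE_REFCNT_PUT ((1LL << 32) - 1LL)

int main(void)
{
	uint64_t ref = UPROBE_REFCNT_GET;   /* refcnt=1, epoch=1 */

	ref += UPROBE_REFCNT_PUT;           /* legitimate last put: refcnt=0, epoch=2 */
	assert((uint32_t)ref == 0);

	ref += UPROBE_REFCNT_PUT;           /* double put: no carry out of the low half */
	assert((uint32_t)ref == 0xffffffff);        /* "refcount" is now ~4 billion */
	assert((uint32_t)(ref >> 32) == 2);         /* and the epoch didn't even move */
	return 0;
}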

> +     if (unlikely((u32)v == 0)) {
> +             bool destroy;
> +
> +             write_lock(&uprobes_treelock);
> +             /*
> +              * We might race with find_uprobe()->__get_uprobe() executed
> +              * from inside read-locked uprobes_treelock, which can bump
> +              * refcount from zero back to one, after we got here. Even
> +              * worse, it's possible for another CPU to do 0 -> 1 -> 0
> +              * transition between this CPU doing atomic_add() and taking
> +              * uprobes_treelock. In either case this CPU should bail out
> +              * and not proceed with destruction.
> +              *
> +              * So now that we have exclusive write lock, we double check
> +              * the total 64-bit refcount value, which includes the epoch.
> +              * If nothing changed (i.e., epoch is the same and refcnt is
> +              * still zero), we are good and we proceed with the clean up.
> +              *
> +              * But if it managed to be updated back at least once, we just
> +              * pretend it never went to zero. If lower 32-bit refcnt part
> +              * drops to zero again, another CPU will proceed with
> +              * destruction, due to a more up-to-date epoch.
> +              */
> +             destroy = atomic64_read(&uprobe->ref) == v;
> +             if (destroy && uprobe_is_active(uprobe))
> +                     rb_erase(&uprobe->rb_node, &uprobes_tree);
> +             write_unlock(&uprobes_treelock);
> +
> +             /*
> +              * Beyond here we don't need RCU protection, we are either the
> +              * winning destructor and we control the rest of uprobe's
> +              * lifetime; or we lost and we are bailing without accessing
> +              * uprobe fields anymore.
> +              */
> +             rcu_read_unlock_trace();
> +
> +             /* uprobe got resurrected, pretend we never tried to free it */
> +             if (!destroy)
> +                     return;
> +
>               /*
>                * If application munmap(exec_vma) before uprobe_unregister()
>                * gets called, we don't get a chance to remove uprobe from
> @@ -604,8 +728,21 @@ static void put_uprobe(struct uprobe *uprobe)
>               mutex_lock(&delayed_uprobe_lock);
>               delayed_uprobe_remove(uprobe, NULL);
>               mutex_unlock(&delayed_uprobe_lock);
> -             kfree(uprobe);
> +
> +             call_rcu_tasks_trace(&uprobe->rcu, uprobe_free_rcu);
> +             return;
>       }
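
(uprobe_free_rcu() isn't in this hunk; I'm assuming it is just the usual
container_of() + kfree() pair, something like the below -- a guess, not quoted
from the patch.)

/* assumed shape of the deferred free callback, not part of this hunk */
static void uprobe_free_rcu(struct rcu_head *rcu)
{
	struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);

	kfree(uprobe);
}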
> +
> +     /*
> +      * If the highest bit is set, we need to clear it. If cmpxchg() fails,
> +      * we don't retry because there is another CPU that just managed to
> +      * update refcnt and will attempt the same "fix up". Eventually one of
> +      * them will succeed in clearing the highest bit.
> +      */
> +     if (unlikely(v < 0))
> +             (void)atomic64_cmpxchg(&uprobe->ref, v, v & ~(1ULL << 63));
> +
> +     rcu_read_unlock_trace();
>  }
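
If I read the scheme right, the epoch does disambiguate the race the comment
describes -- a single-threaded user-space walk-through of that interleaving
(two puts both hit refcnt==0, only the one with the up-to-date epoch passes the
double-check):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define UPROBE_REFCNT_GET ((1LL << 32) + 1LL)
#define UPROBE_REFCNT_PUT ((1LL << 32) - 1LL)

int main(void)
{
	uint64_t ref = UPROBE_REFCNT_GET;   /* refcnt=1, epoch=1 */
	uint64_t v_a, v_b;

	/* CPU A: drops the last ref, sees refcnt==0, heads for uprobes_treelock */
	ref += UPROBE_REFCNT_PUT;
	v_a = ref;                          /* refcnt=0, epoch=2 */

	/* CPU B (under read-locked uprobes_treelock): 0 -> 1 -> 0 in the meantime */
	ref += UPROBE_REFCNT_GET;
	ref += UPROBE_REFCNT_PUT;
	v_b = ref;                          /* refcnt=0, epoch=4 */

	/* the double-check under the write lock: A bails out, B destroys */
	assert(ref != v_a);
	assert(ref == v_b);

	printf("epoch seen by A: %u, by B: %u\n",
	       (uint32_t)(v_a >> 32), (uint32_t)(v_b >> 32));
	return 0;
}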
