To avoid unnecessarily taking a (brief) refcount on uprobe during
breakpoint handling in handle_swbp for entry uprobes, make find_uprobe()
not take refcount, but protect the lifetime of a uprobe instance with
RCU. This improves scalability, as refcount gets quite expensive due to
cache line bouncing between multiple CPUs.

Specifically, we utilize our own uprobe-specific SRCU instance for this
RCU protection. put_uprobe() will delay actual kfree() using call_srcu().

For now, uretprobe and single-stepping handling will still acquire
refcount as necessary. We'll address these issues in follow up patches
by making them use SRCU with timeout.

Signed-off-by: Andrii Nakryiko <and...@kernel.org>
---
 kernel/events/uprobes.c | 93 ++++++++++++++++++++++++-----------------
 1 file changed, 55 insertions(+), 38 deletions(-)

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 23dde3ec5b09..6d5c3f4b210f 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -41,6 +41,8 @@ static struct rb_root uprobes_tree = RB_ROOT;
 
 static DEFINE_RWLOCK(uprobes_treelock);        /* serialize rbtree access */
 
+DEFINE_STATIC_SRCU(uprobes_srcu);
+
 #define UPROBES_HASH_SZ        13
 /* serialize uprobe->pending_list */
 static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
@@ -59,6 +61,7 @@ struct uprobe {
        struct list_head        pending_list;
        struct uprobe_consumer  *consumers;
        struct inode            *inode;         /* Also hold a ref to inode */
+       struct rcu_head         rcu;
        loff_t                  offset;
        loff_t                  ref_ctr_offset;
        unsigned long           flags;
@@ -612,6 +615,13 @@ static inline bool uprobe_is_active(struct uprobe *uprobe)
        return !RB_EMPTY_NODE(&uprobe->rb_node);
 }
 
+static void uprobe_free_rcu(struct rcu_head *rcu)
+{
+       struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
+
+       kfree(uprobe);
+}
+
 static void put_uprobe(struct uprobe *uprobe)
 {
        if (!refcount_dec_and_test(&uprobe->ref))
@@ -632,6 +642,8 @@ static void put_uprobe(struct uprobe *uprobe)
        mutex_lock(&delayed_uprobe_lock);
        delayed_uprobe_remove(uprobe, NULL);
        mutex_unlock(&delayed_uprobe_lock);
+
+       call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_free_rcu);
 }
 
 static __always_inline
@@ -673,33 +685,25 @@ static inline int __uprobe_cmp(struct rb_node *a, const 
struct rb_node *b)
        return uprobe_cmp(u->inode, u->offset, __node_2_uprobe(b));
 }
 
-static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
+/*
+ * Assumes being inside RCU protected region.
+ * No refcount is taken on returned uprobe.
+ */
+static struct uprobe *find_uprobe_rcu(struct inode *inode, loff_t offset)
 {
        struct __uprobe_key key = {
                .inode = inode,
                .offset = offset,
        };
-       struct rb_node *node = rb_find(&key, &uprobes_tree, __uprobe_cmp_key);
-
-       if (node)
-               return try_get_uprobe(__node_2_uprobe(node));
+       struct rb_node *node;
 
-       return NULL;
-}
-
-/*
- * Find a uprobe corresponding to a given inode:offset
- * Acquires uprobes_treelock
- */
-static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
-{
-       struct uprobe *uprobe;
+       lockdep_assert(srcu_read_lock_held(&uprobes_srcu));
 
        read_lock(&uprobes_treelock);
-       uprobe = __find_uprobe(inode, offset);
+       node = rb_find(&key, &uprobes_tree, __uprobe_cmp_key);
        read_unlock(&uprobes_treelock);
 
-       return uprobe;
+       return node ? __node_2_uprobe(node) : NULL;
 }
 
 /*
@@ -1073,10 +1077,10 @@ register_for_each_vma(struct uprobe *uprobe, struct 
uprobe_consumer *new)
                        goto free;
                /*
                 * We take mmap_lock for writing to avoid the race with
-                * find_active_uprobe() which takes mmap_lock for reading.
+                * find_active_uprobe_rcu() which takes mmap_lock for reading.
                 * Thus this install_breakpoint() can not make
-                * is_trap_at_addr() true right after find_uprobe()
-                * returns NULL in find_active_uprobe().
+                * is_trap_at_addr() true right after find_uprobe_rcu()
+                * returns NULL in find_active_uprobe_rcu().
                 */
                mmap_write_lock(mm);
                vma = find_vma(mm, info->vaddr);
@@ -1885,9 +1889,13 @@ static void prepare_uretprobe(struct uprobe *uprobe, 
struct pt_regs *regs)
                return;
        }
 
+       /* we need to bump refcount to store uprobe in utask */
+       if (!try_get_uprobe(uprobe))
+               return;
+
        ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
        if (!ri)
-               return;
+               goto fail;
 
        trampoline_vaddr = uprobe_get_trampoline_vaddr();
        orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, 
regs);
@@ -1914,11 +1922,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, 
struct pt_regs *regs)
                }
                orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
        }
-        /*
-         * uprobe's refcnt is positive, held by caller, so it's safe to
-         * unconditionally bump it one more time here
-         */
-       ri->uprobe = get_uprobe(uprobe);
+       ri->uprobe = uprobe;
        ri->func = instruction_pointer(regs);
        ri->stack = user_stack_pointer(regs);
        ri->orig_ret_vaddr = orig_ret_vaddr;
@@ -1929,8 +1933,9 @@ static void prepare_uretprobe(struct uprobe *uprobe, 
struct pt_regs *regs)
        utask->return_instances = ri;
 
        return;
- fail:
+fail:
        kfree(ri);
+       put_uprobe(uprobe);
 }
 
 /* Prepare to single-step probed instruction out of line. */
@@ -1945,9 +1950,14 @@ pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, 
unsigned long bp_vaddr)
        if (!utask)
                return -ENOMEM;
 
+       if (!try_get_uprobe(uprobe))
+               return -EINVAL;
+
        xol_vaddr = xol_get_insn_slot(uprobe);
-       if (!xol_vaddr)
-               return -ENOMEM;
+       if (!xol_vaddr) {
+               err = -ENOMEM;
+               goto err_out;
+       }
 
        utask->xol_vaddr = xol_vaddr;
        utask->vaddr = bp_vaddr;
@@ -1955,12 +1965,15 @@ pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, 
unsigned long bp_vaddr)
        err = arch_uprobe_pre_xol(&uprobe->arch, regs);
        if (unlikely(err)) {
                xol_free_insn_slot(current);
-               return err;
+               goto err_out;
        }
 
        utask->active_uprobe = uprobe;
        utask->state = UTASK_SSTEP;
        return 0;
+err_out:
+       put_uprobe(uprobe);
+       return err;
 }
 
 /*
@@ -2044,7 +2057,8 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned 
long vaddr)
        return is_trap_insn(&opcode);
 }
 
-static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
+/* assumes being inside RCU protected region */
+static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int 
*is_swbp)
 {
        struct mm_struct *mm = current->mm;
        struct uprobe *uprobe = NULL;
@@ -2057,7 +2071,7 @@ static struct uprobe *find_active_uprobe(unsigned long 
bp_vaddr, int *is_swbp)
                        struct inode *inode = file_inode(vma->vm_file);
                        loff_t offset = vaddr_to_offset(vma, bp_vaddr);
 
-                       uprobe = find_uprobe(inode, offset);
+                       uprobe = find_uprobe_rcu(inode, offset);
                }
 
                if (!uprobe)
@@ -2201,13 +2215,15 @@ static void handle_swbp(struct pt_regs *regs)
 {
        struct uprobe *uprobe;
        unsigned long bp_vaddr;
-       int is_swbp;
+       int is_swbp, srcu_idx;
 
        bp_vaddr = uprobe_get_swbp_addr(regs);
        if (bp_vaddr == uprobe_get_trampoline_vaddr())
                return uprobe_handle_trampoline(regs);
 
-       uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
+       srcu_idx = srcu_read_lock(&uprobes_srcu);
+
+       uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp);
        if (!uprobe) {
                if (is_swbp > 0) {
                        /* No matching uprobe; signal SIGTRAP. */
@@ -2223,6 +2239,7 @@ static void handle_swbp(struct pt_regs *regs)
                         */
                        instruction_pointer_set(regs, bp_vaddr);
                }
+               srcu_read_unlock(&uprobes_srcu, srcu_idx);
                return;
        }
 
@@ -2258,12 +2275,12 @@ static void handle_swbp(struct pt_regs *regs)
        if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
                goto out;
 
-       if (!pre_ssout(uprobe, regs, bp_vaddr))
-               return;
+       if (pre_ssout(uprobe, regs, bp_vaddr))
+               goto out;
 
-       /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
 out:
-       put_uprobe(uprobe);
+       /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
+       srcu_read_unlock(&uprobes_srcu, srcu_idx);
 }
 
 /*
-- 
2.43.0


Reply via email to