[PATCH v2] uprobes: reduce contention on uprobes_tree access

2024-04-22 Thread Jonathan Haslam
Active uprobes are stored in an RB tree and accesses to this tree are
dominated by read operations. Currently these accesses are serialized by
a spinlock but this leads to enormous contention when large numbers of
threads are executing active probes.

This patch converts the spinlock used to serialize access to the
uprobes_tree RB tree into a reader-writer spinlock. This lock type
aligns naturally with the overwhelmingly read-only nature of the tree
usage here. Although the addition of reader-writer spinlocks is
discouraged [0], this fix is proposed as an interim solution while an
RCU based approach is implemented (that work is in a nascent form). This
fix also has the benefit of being trivial, self contained and therefore
simple to backport.

We have used a uprobe benchmark from the BPF selftests [1] to estimate
the improvements. Each block of results below shows 1 line per execution
of the benchmark (the "Summary" line) and each line is a run with one
more thread added - a thread is a "producer". The lines are edited to
remove extraneous output.

The tests were executed with this driver script:

for num_threads in {1..20}
do
  sudo ./bench -a -p $num_threads trig-uprobe-nop | grep Summary
done

SPINLOCK (BEFORE)
==
Summary: hits 1.396 ± 0.007M/s (  1.396M/prod)
Summary: hits 1.656 ± 0.016M/s (  0.828M/prod)
Summary: hits 2.246 ± 0.008M/s (  0.749M/prod)
Summary: hits 2.114 ± 0.010M/s (  0.529M/prod)
Summary: hits 2.013 ± 0.009M/s (  0.403M/prod)
Summary: hits 1.753 ± 0.008M/s (  0.292M/prod)
Summary: hits 1.847 ± 0.001M/s (  0.264M/prod)
Summary: hits 1.889 ± 0.001M/s (  0.236M/prod)
Summary: hits 1.833 ± 0.006M/s (  0.204M/prod)
Summary: hits 1.900 ± 0.003M/s (  0.190M/prod)
Summary: hits 1.918 ± 0.006M/s (  0.174M/prod)
Summary: hits 1.925 ± 0.002M/s (  0.160M/prod)
Summary: hits 1.837 ± 0.001M/s (  0.141M/prod)
Summary: hits 1.898 ± 0.001M/s (  0.136M/prod)
Summary: hits 1.799 ± 0.016M/s (  0.120M/prod)
Summary: hits 1.850 ± 0.005M/s (  0.109M/prod)
Summary: hits 1.816 ± 0.002M/s (  0.101M/prod)
Summary: hits 1.787 ± 0.001M/s (  0.094M/prod)
Summary: hits 1.764 ± 0.002M/s (  0.088M/prod)

RW SPINLOCK (AFTER)
===
Summary: hits 1.444 ± 0.020M/s (  1.444M/prod)
Summary: hits 2.279 ± 0.011M/s (  1.139M/prod)
Summary: hits 3.422 ± 0.014M/s (  1.141M/prod)
Summary: hits 3.565 ± 0.017M/s (  0.891M/prod)
Summary: hits 2.671 ± 0.013M/s (  0.534M/prod)
Summary: hits 2.409 ± 0.005M/s (  0.401M/prod)
Summary: hits 2.485 ± 0.008M/s (  0.355M/prod)
Summary: hits 2.496 ± 0.003M/s (  0.312M/prod)
Summary: hits 2.585 ± 0.002M/s (  0.287M/prod)
Summary: hits 2.908 ± 0.011M/s (  0.291M/prod)
Summary: hits 2.346 ± 0.016M/s (  0.213M/prod)
Summary: hits 2.804 ± 0.004M/s (  0.234M/prod)
Summary: hits 2.556 ± 0.001M/s (  0.197M/prod)
Summary: hits 2.754 ± 0.004M/s (  0.197M/prod)
Summary: hits 2.482 ± 0.002M/s (  0.165M/prod)
Summary: hits 2.412 ± 0.005M/s (  0.151M/prod)
Summary: hits 2.710 ± 0.003M/s (  0.159M/prod)
Summary: hits 2.826 ± 0.005M/s (  0.157M/prod)
Summary: hits 2.718 ± 0.001M/s (  0.143M/prod)
Summary: hits 2.844 ± 0.006M/s (  0.142M/prod)

The numbers in parentheses give averaged throughput per thread which is
of greatest interest here as a measure of scalability. Improvements are
in the order of 22 - 68% with this particular benchmark (mean = 43%).

V2:
 - Updated commit message to include benchmark results.

[0] https://docs.kernel.org/locking/spinlocks.html
[1] https://github.com/torvalds/linux/blob/master/tools/testing/selftests/bpf/benchs/bench_trigger.c

Signed-off-by: Jonathan Haslam 
---
 kernel/events/uprobes.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index e4834d23e1d1..8ae0eefc3a34 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -39,7 +39,7 @@ static struct rb_root uprobes_tree = RB_ROOT;
  */
 #define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree)
 
-static DEFINE_SPINLOCK(uprobes_treelock);  /* serialize rbtree access */
+static DEFINE_RWLOCK(uprobes_treelock);/* serialize rbtree access */
 
 #define UPROBES_HASH_SZ 13
 /* serialize uprobe->pending_list */
@@ -669,9 +669,9 @@ static struct uprobe *find_uprobe(struct inode *inode, 
loff_t offset)
 {
struct uprobe *uprobe;
 
-   spin_lock(&uprobes_treelock);
+   read_lock(&uprobes_treelock);
uprobe = __find_uprobe(inode, offset);
-   spin_unlock(&uprobes_treelock);
+   read_unlock(&uprobes_treelock);
 
return uprobe;
 }
@@ -701,9 +701,9 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe)
 {
struct uprobe *u;
 
-   spin_lock(&uprobes_treelock);
+   write_lock(&uprobes_treelock);
u = __insert_uprobe(uprobe);
-   spin_unlock(&uprobes_treelock);
+   write_unlock(&uprobes_treelock);
 
return u;
 }
@@ 

[PATCH] uprobes: reduce contention on uprobes_tree access

2024-03-21 Thread Jonathan Haslam
Active uprobes are stored in an RB tree and accesses to this tree are
dominated by read operations. Currently these accesses are serialized by
a spinlock but this leads to enormous contention when large numbers of
threads are executing active probes.

This patch converts the spinlock used to serialize access to the
uprobes_tree RB tree into a reader-writer spinlock. This lock type
aligns naturally with the overwhelmingly read-only nature of the tree
usage here. Although the addition of reader-writer spinlocks is
discouraged [0], this fix is proposed as an interim solution while an
RCU based approach is implemented (that work is in a nascent form). This
fix also has the benefit of being trivial, self contained and therefore
simple to backport.

This change has been tested against production workloads that exhibit
significant contention on the spinlock, and an almost order-of-magnitude
reduction in mean uprobe execution time is observed (28 -> 3.5 microsecs).

[0] https://docs.kernel.org/locking/spinlocks.html

Signed-off-by: Jonathan Haslam 
---
 kernel/events/uprobes.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 929e98c62965..42bf9b6e8bc0 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -39,7 +39,7 @@ static struct rb_root uprobes_tree = RB_ROOT;
  */
 #define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree)
 
-static DEFINE_SPINLOCK(uprobes_treelock);  /* serialize rbtree access */
+static DEFINE_RWLOCK(uprobes_treelock);/* serialize rbtree access */
 
 #define UPROBES_HASH_SZ 13
 /* serialize uprobe->pending_list */
@@ -669,9 +669,9 @@ static struct uprobe *find_uprobe(struct inode *inode, 
loff_t offset)
 {
struct uprobe *uprobe;
 
-   spin_lock(&uprobes_treelock);
+   read_lock(&uprobes_treelock);
uprobe = __find_uprobe(inode, offset);
-   spin_unlock(&uprobes_treelock);
+   read_unlock(&uprobes_treelock);
 
return uprobe;
 }
@@ -701,9 +701,9 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe)
 {
struct uprobe *u;
 
-   spin_lock(&uprobes_treelock);
+   write_lock(&uprobes_treelock);
u = __insert_uprobe(uprobe);
-   spin_unlock(&uprobes_treelock);
+   write_unlock(&uprobes_treelock);
 
return u;
 }
@@ -935,9 +935,9 @@ static void delete_uprobe(struct uprobe *uprobe)
if (WARN_ON(!uprobe_is_active(uprobe)))
return;
 
-   spin_lock(&uprobes_treelock);
+   write_lock(&uprobes_treelock);
rb_erase(&uprobe->rb_node, &uprobes_tree);
-   spin_unlock(&uprobes_treelock);
+   write_unlock(&uprobes_treelock);
RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
put_uprobe(uprobe);
 }
@@ -1298,7 +1298,7 @@ static void build_probe_list(struct inode *inode,
min = vaddr_to_offset(vma, start);
max = min + (end - start) - 1;
 
-   spin_lock(&uprobes_treelock);
+   read_lock(&uprobes_treelock);
n = find_node_in_range(inode, min, max);
if (n) {
for (t = n; t; t = rb_prev(t)) {
@@ -1316,7 +1316,7 @@ static void build_probe_list(struct inode *inode,
get_uprobe(u);
}
}
-   spin_unlock(&uprobes_treelock);
+   read_unlock(&uprobes_treelock);
 }
 
 /* @vma contains reference counter, not the probed instruction. */
@@ -1407,9 +1407,9 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long 
start, unsigned long e
min = vaddr_to_offset(vma, start);
max = min + (end - start) - 1;
 
-   spin_lock(&uprobes_treelock);
+   read_lock(&uprobes_treelock);
n = find_node_in_range(inode, min, max);
-   spin_unlock(&uprobes_treelock);
+   read_unlock(&uprobes_treelock);
 
return !!n;
 }
-- 
2.43.0