Automatically enable the rcu_normal_wake_from_gp parameter on systems with a small number of CPUs. The activation threshold is set to 16 CPUs.
This helps to reduce a latency of normal synchronize_rcu() API by waking up GP-waiters earlier and decoupling synchronize_rcu() callers from regular callback handling. A benchmark running 64 parallel jobs invoking synchronize_rcu() demonstrates a notable latency reduction with the setting enabled. Latency distribution (microseconds): <default> 0 - 9999 : 1 10000 - 19999 : 4 20000 - 29999 : 399 30000 - 39999 : 3197 40000 - 49999 : 10428 50000 - 59999 : 17363 60000 - 69999 : 15529 70000 - 79999 : 9287 80000 - 89999 : 4249 90000 - 99999 : 1915 100000 - 109999 : 922 110000 - 119999 : 390 120000 - 129999 : 187 ... <default> <rcu_normal_wake_from_gp> 0 - 9999 : 1 10000 - 19999 : 234 20000 - 29999 : 6678 30000 - 39999 : 33463 40000 - 49999 : 20669 50000 - 59999 : 2766 60000 - 69999 : 183 ... <rcu_normal_wake_from_gp> Signed-off-by: Uladzislau Rezki (Sony) <ure...@gmail.com> --- kernel/rcu/tree.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 14d4499c6fc3..c0e0b38a08dc 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1625,7 +1625,9 @@ static void rcu_sr_put_wait_head(struct llist_node *node) atomic_set_release(&sr_wn->inuse, 0); } -/* Disabled by default. */ +/* Enable rcu_normal_wake_from_gp automatically on small systems. */ +#define WAKE_FROM_GP_CPU_THRESHOLD 16 + static int rcu_normal_wake_from_gp; module_param(rcu_normal_wake_from_gp, int, 0644); static struct workqueue_struct *sync_wq; @@ -4847,6 +4849,9 @@ void __init rcu_init(void) sync_wq = alloc_workqueue("sync_wq", WQ_MEM_RECLAIM, 0); WARN_ON(!sync_wq); + if (num_possible_cpus() <= WAKE_FROM_GP_CPU_THRESHOLD) + WRITE_ONCE(rcu_normal_wake_from_gp, 1); + /* Fill in default value for rcutree.qovld boot parameter. */ /* -After- the rcu_node ->lock fields are initialized! */ if (qovld < 0) -- 2.39.5