On Wed, Mar 20, 2019 at 10:13:33PM +0100, Sebastian Andrzej Siewior wrote:
> Running RCU out of softirq is a problem for some workloads that would
> like to manage RCU core processing independently of other softirq
> work, for example, setting kthread priority.  This commit therefore
> introduces the `rcunosoftirq' option which moves the RCU core work
> from softirq to a per-CPU/per-flavor SCHED_OTHER kthread named rcuc.
> The SCHED_OTHER approach avoids the scalability problems that appeared
> with the earlier attempt to move RCU core processing from softirq
> to kthreads.  That said, kernels built with RCU_BOOST=y will run the
> rcuc kthreads at the RCU-boosting priority.
> 
> Reported-by: Thomas Gleixner <t...@linutronix.de>
> Tested-by: Mike Galbraith <efa...@gmx.de>
> Signed-off-by: Sebastian Andrzej Siewior <bige...@linutronix.de>

Thank you!  I reverted v2 and applied this one with the same sort of
update.  Testing is going well thus far, aside from my failing to add
the required "=0" after rcutree.use_softirq.  I will probably not
be the only one who will run afoul of this, so I updated the commit log
and the documentation accordingly, as shown below.
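
In case it saves someone else a reboot: the kthread mode really does
require the explicit "=0" on the kernel command line, for example a
boot line ending like this (the root= portion is of course just a
placeholder):

        root=/dev/sda1 ro rcutree.use_softirq=0

Omitting the option (or specifying "=1") keeps RCU core processing in
RCU_SOFTIRQ.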

                                                        Thanx, Paul

------------------------------------------------------------------------

commit 5971694b716d34baa86f3f1dd44f8e587a17d8f0
Author: Sebastian Andrzej Siewior <bige...@linutronix.de>
Date:   Wed Mar 20 22:13:33 2019 +0100

    rcu: Enable elimination of Tree-RCU softirq processing
    
    Some workloads need to change kthread priority for RCU core processing
    without affecting other softirq work.  This commit therefore introduces
    the rcutree.use_softirq kernel boot parameter, which moves the RCU core
    work from softirq to a per-CPU SCHED_OTHER kthread named rcuc.  Use of
    the SCHED_OTHER approach avoids the scalability problems that appeared
    with the earlier attempt to move RCU core processing from softirq
    to kthreads.  That said, kernels built with RCU_BOOST=y will run the
    rcuc kthreads at the RCU-boosting priority.
    
    Note that rcutree.use_softirq=0 must be specified to move RCU core
    processing to the rcuc kthreads: rcutree.use_softirq=1 is the default.
    
    Reported-by: Thomas Gleixner <t...@linutronix.de>
    Tested-by: Mike Galbraith <efa...@gmx.de>
    Signed-off-by: Sebastian Andrzej Siewior <bige...@linutronix.de>
    Signed-off-by: Paul E. McKenney <paul...@linux.ibm.com>

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index d377a2166b79..e2ffb1d9de03 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3672,6 +3672,12 @@
                        the propagation of recent CPU-hotplug changes up
                        the rcu_node combining tree.
 
+       rcutree.use_softirq=    [KNL]
+                       If set to zero, move all RCU_SOFTIRQ processing to
+                       per-CPU rcuc kthreads.  Defaults to a non-zero
+                       value, meaning that RCU_SOFTIRQ is used by default.
+                       Specify rcutree.use_softirq=0 to use rcuc kthreads.
+
        rcutree.rcu_fanout_exact= [KNL]
                        Disable autobalancing of the rcu_node combining
                        tree.  This is used by rcutorture, and might
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index ec77ec336f58..6bd05c9918cc 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -51,6 +51,12 @@
 #include <linux/tick.h>
 #include <linux/sysrq.h>
 #include <linux/kprobes.h>
+#include <linux/gfp.h>
+#include <linux/oom.h>
+#include <linux/smpboot.h>
+#include <linux/jiffies.h>
+#include <linux/sched/isolation.h>
+#include "../time/tick-internal.h"
 
 #include "tree.h"
 #include "rcu.h"
@@ -92,6 +98,9 @@ struct rcu_state rcu_state = {
 /* Dump rcu_node combining tree at boot to verify correct setup. */
 static bool dump_tree;
 module_param(dump_tree, bool, 0444);
+/* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */
+static bool use_softirq = 1;
+module_param(use_softirq, bool, 0444);
 /* Control rcu_node-tree auto-balancing at boot time. */
 static bool rcu_fanout_exact;
 module_param(rcu_fanout_exact, bool, 0444);
@@ -2253,7 +2262,7 @@ void rcu_force_quiescent_state(void)
 EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
 
 /* Perform RCU core processing work for the current CPU.  */
-static __latent_entropy void rcu_core(struct softirq_action *unused)
+static __latent_entropy void rcu_core(void)
 {
        unsigned long flags;
        struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
@@ -2295,6 +2304,34 @@ static __latent_entropy void rcu_core(struct softirq_action *unused)
        trace_rcu_utilization(TPS("End RCU core"));
 }
 
+static void rcu_core_si(struct softirq_action *h)
+{
+       rcu_core();
+}
+
+static void rcu_wake_cond(struct task_struct *t, int status)
+{
+       /*
+        * If the thread is yielding, only wake it when this
+        * is invoked from idle
+        */
+       if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current)))
+               wake_up_process(t);
+}
+
+static void invoke_rcu_core_kthread(void)
+{
+       struct task_struct *t;
+       unsigned long flags;
+
+       local_irq_save(flags);
+       __this_cpu_write(rcu_data.rcu_cpu_has_work, 1);
+       t = __this_cpu_read(rcu_data.rcu_cpu_kthread_task);
+       if (t != NULL && t != current)
+               rcu_wake_cond(t, __this_cpu_read(rcu_data.rcu_cpu_kthread_status));
+       local_irq_restore(flags);
+}
+
 /*
  * Schedule RCU callback invocation.  If the running implementation of RCU
  * does not support RCU priority boosting, just do a direct call, otherwise
@@ -2306,18 +2343,94 @@ static void invoke_rcu_callbacks(struct rcu_data *rdp)
 {
        if (unlikely(!READ_ONCE(rcu_scheduler_fully_active)))
                return;
-       if (likely(!rcu_state.boost)) {
-               rcu_do_batch(rdp);
-               return;
-       }
-       invoke_rcu_callbacks_kthread();
+       if (rcu_state.boost || !use_softirq)
+               invoke_rcu_core_kthread();
+       rcu_do_batch(rdp);
 }
 
+/*
+ * Schedule RCU core processing on this CPU, via softirq or the rcuc kthread.
+ */
 static void invoke_rcu_core(void)
 {
-       if (cpu_online(smp_processor_id()))
+       if (!cpu_online(smp_processor_id()))
+               return;
+       if (use_softirq)
                raise_softirq(RCU_SOFTIRQ);
+       else
+               invoke_rcu_core_kthread();
+}
+
+static void rcu_cpu_kthread_park(unsigned int cpu)
+{
+       per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
+}
+
+static int rcu_cpu_kthread_should_run(unsigned int cpu)
+{
+       return __this_cpu_read(rcu_data.rcu_cpu_has_work);
+}
+
+/*
+ * Per-CPU kernel thread that invokes RCU callbacks.  This is used by
+ * kernels built with RCU_BOOST=y and by kernels booted with
+ * rcutree.use_softirq=0, in place of RCU_SOFTIRQ processing.
+ */
+static void rcu_cpu_kthread(unsigned int cpu)
+{
+       unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status);
+       char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work);
+       int spincnt;
+
+       for (spincnt = 0; spincnt < 10; spincnt++) {
+               trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
+               local_bh_disable();
+               *statusp = RCU_KTHREAD_RUNNING;
+               local_irq_disable();
+               work = *workp;
+               *workp = 0;
+               local_irq_enable();
+               if (work)
+                       rcu_core();
+               local_bh_enable();
+               if (*workp == 0) {
+                       trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
+                       *statusp = RCU_KTHREAD_WAITING;
+                       return;
+               }
+       }
+       *statusp = RCU_KTHREAD_YIELDING;
+       trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
+       schedule_timeout_interruptible(2);
+       trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
+       *statusp = RCU_KTHREAD_WAITING;
+}
+
+static struct smp_hotplug_thread rcu_cpu_thread_spec = {
+       .store                  = &rcu_data.rcu_cpu_kthread_task,
+       .thread_should_run      = rcu_cpu_kthread_should_run,
+       .thread_fn              = rcu_cpu_kthread,
+       .thread_comm            = "rcuc/%u",
+       .setup                  = rcu_cpu_kthread_setup,
+       .park                   = rcu_cpu_kthread_park,
+};
+
+/*
+ * Spawn per-CPU RCU core processing kthreads.
+ */
+static int __init rcu_spawn_core_kthreads(void)
+{
+       int cpu;
+
+       for_each_possible_cpu(cpu)
+               per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0;
+       if (!IS_ENABLED(CONFIG_RCU_BOOST) && use_softirq)
+               return 0;
+       WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec),
+                 "%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__);
+       return 0;
 }
+early_initcall(rcu_spawn_core_kthreads);
 
 /*
  * Handle any core-RCU processing required by a call_rcu() invocation.
@@ -3355,7 +3468,8 @@ void __init rcu_init(void)
        rcu_init_one();
        if (dump_tree)
                rcu_dump_rcu_node_tree();
-       open_softirq(RCU_SOFTIRQ, rcu_core);
+       if (use_softirq)
+               open_softirq(RCU_SOFTIRQ, rcu_core_si);
 
        /*
         * We don't need protection against CPU-hotplug here because
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index e253d11af3c4..a1a72a1ecb02 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -407,8 +407,8 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func);
 static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck);
 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
 static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
-static void invoke_rcu_callbacks_kthread(void);
 static bool rcu_is_callbacks_kthread(void);
+static void rcu_cpu_kthread_setup(unsigned int cpu);
 static void __init rcu_spawn_boost_kthreads(void);
 static void rcu_prepare_kthreads(int cpu);
 static void rcu_cleanup_after_idle(void);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index f46b4af96ab9..b807204ffd83 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -11,29 +11,7 @@
  *        Paul E. McKenney <paul...@linux.ibm.com>
  */
 
-#include <linux/delay.h>
-#include <linux/gfp.h>
-#include <linux/oom.h>
-#include <linux/sched/debug.h>
-#include <linux/smpboot.h>
-#include <linux/sched/isolation.h>
-#include <uapi/linux/sched/types.h>
-#include "../time/tick-internal.h"
-
-#ifdef CONFIG_RCU_BOOST
 #include "../locking/rtmutex_common.h"
-#else /* #ifdef CONFIG_RCU_BOOST */
-
-/*
- * Some architectures do not define rt_mutexes, but if !CONFIG_RCU_BOOST,
- * all uses are in dead code.  Provide a definition to keep the compiler
- * happy, but add WARN_ON_ONCE() to complain if used in the wrong place.
- * This probably needs to be excluded from -rt builds.
- */
-#define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; })
-#define rt_mutex_futex_unlock(x) WARN_ON_ONCE(1)
-
-#endif /* #else #ifdef CONFIG_RCU_BOOST */
 
 #ifdef CONFIG_RCU_NOCB_CPU
 static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
@@ -94,6 +72,8 @@ static void __init rcu_bootup_announce_oddness(void)
                pr_info("\tRCU debug GP init slowdown %d jiffies.\n", 
gp_init_delay);
        if (gp_cleanup_delay)
                pr_info("\tRCU debug GP init slowdown %d jiffies.\n", 
gp_cleanup_delay);
+       if (!use_softirq)
+               pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n");
        if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG))
                pr_info("\tRCU debug extended QS entry/exit.\n");
        rcupdate_announce_bootup_oddness();
@@ -629,7 +609,10 @@ static void rcu_read_unlock_special(struct task_struct *t)
                /* Need to defer quiescent state until everything is enabled. */
                if (irqs_were_disabled) {
                        /* Enabling irqs does not reschedule, so... */
-                       raise_softirq_irqoff(RCU_SOFTIRQ);
+                       if (use_softirq)
+                               raise_softirq_irqoff(RCU_SOFTIRQ);
+                       else
+                               invoke_rcu_core();
                } else {
                        /* Enabling BH or preempt does reschedule, so... */
                        set_tsk_need_resched(current);
@@ -944,18 +927,21 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
 
 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
 
+/*
+ * If boosting, set rcuc kthreads to realtime priority.
+ */
+static void rcu_cpu_kthread_setup(unsigned int cpu)
+{
 #ifdef CONFIG_RCU_BOOST
+       struct sched_param sp;
 
-static void rcu_wake_cond(struct task_struct *t, int status)
-{
-       /*
-        * If the thread is yielding, only wake it when this
-        * is invoked from idle
-        */
-       if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
-               wake_up_process(t);
+       sp.sched_priority = kthread_prio;
+       sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
+#endif /* #ifdef CONFIG_RCU_BOOST */
 }
 
+#ifdef CONFIG_RCU_BOOST
+
 /*
  * Carry out RCU priority boosting on the task indicated by ->exp_tasks
  * or ->boost_tasks, advancing the pointer to the next task in the
@@ -1093,23 +1079,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
        }
 }
 
-/*
- * Wake up the per-CPU kthread to invoke RCU callbacks.
- */
-static void invoke_rcu_callbacks_kthread(void)
-{
-       unsigned long flags;
-
-       local_irq_save(flags);
-       __this_cpu_write(rcu_data.rcu_cpu_has_work, 1);
-       if (__this_cpu_read(rcu_data.rcu_cpu_kthread_task) != NULL &&
-           current != __this_cpu_read(rcu_data.rcu_cpu_kthread_task)) {
-               rcu_wake_cond(__this_cpu_read(rcu_data.rcu_cpu_kthread_task),
-                             __this_cpu_read(rcu_data.rcu_cpu_kthread_status));
-       }
-       local_irq_restore(flags);
-}
-
 /*
  * Is the current CPU running the RCU-callbacks kthread?
  * Caller must have preemption disabled.
@@ -1163,59 +1132,6 @@ static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
        return 0;
 }
 
-static void rcu_cpu_kthread_setup(unsigned int cpu)
-{
-       struct sched_param sp;
-
-       sp.sched_priority = kthread_prio;
-       sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
-}
-
-static void rcu_cpu_kthread_park(unsigned int cpu)
-{
-       per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
-}
-
-static int rcu_cpu_kthread_should_run(unsigned int cpu)
-{
-       return __this_cpu_read(rcu_data.rcu_cpu_has_work);
-}
-
-/*
- * Per-CPU kernel thread that invokes RCU callbacks.  This replaces
- * the RCU softirq used in configurations of RCU that do not support RCU
- * priority boosting.
- */
-static void rcu_cpu_kthread(unsigned int cpu)
-{
-       unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status);
-       char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work);
-       int spincnt;
-
-       for (spincnt = 0; spincnt < 10; spincnt++) {
-               trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
-               local_bh_disable();
-               *statusp = RCU_KTHREAD_RUNNING;
-               local_irq_disable();
-               work = *workp;
-               *workp = 0;
-               local_irq_enable();
-               if (work)
-                       rcu_do_batch(this_cpu_ptr(&rcu_data));
-               local_bh_enable();
-               if (*workp == 0) {
-                       trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
-                       *statusp = RCU_KTHREAD_WAITING;
-                       return;
-               }
-       }
-       *statusp = RCU_KTHREAD_YIELDING;
-       trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
-       schedule_timeout_interruptible(2);
-       trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
-       *statusp = RCU_KTHREAD_WAITING;
-}
-
 /*
  * Set the per-rcu_node kthread's affinity to cover all CPUs that are
  * served by the rcu_node in question.  The CPU hotplug lock is still
@@ -1246,27 +1162,13 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
        free_cpumask_var(cm);
 }
 
-static struct smp_hotplug_thread rcu_cpu_thread_spec = {
-       .store                  = &rcu_data.rcu_cpu_kthread_task,
-       .thread_should_run      = rcu_cpu_kthread_should_run,
-       .thread_fn              = rcu_cpu_kthread,
-       .thread_comm            = "rcuc/%u",
-       .setup                  = rcu_cpu_kthread_setup,
-       .park                   = rcu_cpu_kthread_park,
-};
-
 /*
  * Spawn boost kthreads -- called as soon as the scheduler is running.
  */
 static void __init rcu_spawn_boost_kthreads(void)
 {
        struct rcu_node *rnp;
-       int cpu;
 
-       for_each_possible_cpu(cpu)
-               per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0;
-       if (WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec), "%s: Could not start rcub kthread, OOM is now expected behavior\n", __func__))
-               return;
        rcu_for_each_leaf_node(rnp)
                (void)rcu_spawn_one_boost_kthread(rnp);
 }
@@ -1289,11 +1191,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
        raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 }
 
-static void invoke_rcu_callbacks_kthread(void)
-{
-       WARN_ON_ONCE(1);
-}
-
 static bool rcu_is_callbacks_kthread(void)
 {
        return false;

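------------------------------------------------------------------------

For those unfamiliar with the smpboot API that the rcuc kthreads are
built on, here is a minimal sketch of the same register-and-wake
pattern.  It is illustrative only -- the demo_* names are made up for
this sketch and are not part of the patch:

#include <linux/init.h>
#include <linux/irqflags.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/smpboot.h>

static DEFINE_PER_CPU(struct task_struct *, demo_task);
static DEFINE_PER_CPU(char, demo_has_work);

/* Return nonzero when this CPU's thread_fn should run. */
static int demo_thread_should_run(unsigned int cpu)
{
        return __this_cpu_read(demo_has_work);
}

/* Per-CPU work, analogous to rcu_cpu_kthread() above. */
static void demo_thread_fn(unsigned int cpu)
{
        __this_cpu_write(demo_has_work, 0);
        /* Deferred per-CPU processing would go here. */
}

static struct smp_hotplug_thread demo_threads = {
        .store                  = &demo_task,
        .thread_should_run      = demo_thread_should_run,
        .thread_fn              = demo_thread_fn,
        .thread_comm            = "demo/%u",
};

/* Spawn one "demo/N" kthread per CPU; they park/unpark across hotplug. */
static int __init demo_spawn(void)
{
        return smpboot_register_percpu_thread(&demo_threads);
}
early_initcall(demo_spawn);

/*
 * Producer side: mark work and wake this CPU's kthread, much as
 * invoke_rcu_core_kthread() does in the patch above.
 */
static void demo_kick(void)
{
        struct task_struct *t;
        unsigned long flags;

        local_irq_save(flags);
        __this_cpu_write(demo_has_work, 1);
        t = __this_cpu_read(demo_task);
        if (t && t != current)
                wake_up_process(t);
        local_irq_restore(flags);
}

The patch's rcu_cpu_kthread() additionally spins and then yields when
work keeps arriving, but the registration and wakeup protocol is the
same.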