On Thu, Sep 17, 2015 at 04:35:27PM +0200, Peter Zijlstra wrote:
> I'd be happy to fail a CPU down for user tasks where this is the last
> runnable CPU of.

A little like so. Completely untested.

---
Subject: sched: Refuse to unplug a CPU if this will violate user task affinity

It's bad policy to allow unplugging a CPU for which a user set explicit
affinity, either strictly on this CPU or in case this was the last
online CPU in its mask.

Either would end up forcing the thread on a random other CPU, violating
the sys_sched_setaffinity() constraint.

Disallow this by default; root might not be aware of all user
affinities, but can negotiate and change affinities for all tasks.

Provide a sysctl to go back to the old behaviour.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
 include/linux/sched/sysctl.h |  1 +
 kernel/sched/core.c          | 46 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/sysctl.c              |  9 +++++++++
 3 files changed, 56 insertions(+)

diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index c9e4731cf10b..9444b549914b 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -39,6 +39,7 @@ extern unsigned int sysctl_sched_latency;
 extern unsigned int sysctl_sched_min_granularity;
 extern unsigned int sysctl_sched_wakeup_granularity;
 extern unsigned int sysctl_sched_child_runs_first;
+extern unsigned int sysctl_sched_strict_affinity;
 
 enum sched_tunable_scaling {
        SCHED_TUNABLESCALING_NONE,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6ab415aa15c4..457c8b912fc6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -284,6 +284,11 @@ __read_mostly int scheduler_running;
  */
 int sysctl_sched_rt_runtime = 950000;
 
+/*
+ * Disallows cpu unplug if that would result in a task without runnable CPUs.
+ */
+unsigned int sysctl_sched_strict_affinity = 1;
+
 /* cpus with isolated domains */
 cpumask_var_t cpu_isolated_map;
 
@@ -5430,6 +5435,42 @@ static void set_rq_offline(struct rq *rq)
 }
 
 /*
+ * Test if there's a user task for which @cpu is the last runnable CPU.
+ *
+ * Returns false, after logging the first such task found, when unplugging
+ * @cpu would leave that task without an active CPU in its affinity mask.
+ */
+static bool migration_possible(int cpu)
+{
+       struct task_struct *g, *p;
+       bool ret = true;
+       int next;
+
+       read_lock(&tasklist_lock);
+       for_each_process_thread(g, p) {
+               /* skip tasks running elsewhere (not their last cpu) and kthreads */
+               if (task_cpu(p) != cpu || (p->flags & PF_KTHREAD))
+                       continue;
+
+               next = -1;
+again:
+               next = cpumask_next_and(next, tsk_cpus_allowed(p), cpu_active_mask);
+               if (next >= nr_cpu_ids) {
+                       printk(KERN_WARNING "task %s-%d refused unplug of CPU %d\n",
+                                       p->comm, p->pid, cpu);
+                       ret = false;
+                       /* double loop: 'break' would only leave the thread loop */
+                       goto unlock;
+               }
+               if (next == cpu)
+                       goto again;
+       }
+unlock:
+       read_unlock(&tasklist_lock);
+       return ret;
+}
+
+/*
  * migration_call - callback that gets triggered when a CPU is added.
  * Here we can start up the necessary migration thread for the new CPU.
  */
@@ -5440,6 +5481,11 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
        unsigned long flags;
        struct rq *rq = cpu_rq(cpu);
 
+       if (action == CPU_DOWN_PREPARE && sysctl_sched_strict_affinity) {
+               if (!migration_possible(cpu))
+                       return notifier_from_errno(-EBUSY);
+       }
+
        switch (action & ~CPU_TASKS_FROZEN) {
 
        case CPU_UP_PREPARE:
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e69201d8094e..9d0edcc73cc3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -283,6 +283,15 @@ static struct ctl_table kern_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
+#ifdef CONFIG_SMP
+       {
+               .procname       = "sched_strict_affinity",
+               .data           = &sysctl_sched_strict_affinity,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
+#endif /* CONFIG_SMP */
 #ifdef CONFIG_SCHED_DEBUG
        {
                .procname       = "sched_min_granularity_ns",
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to